From: "H.J. Lu" <hjl.tools@gmail.com>
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: libc-alpha@sourceware.org, carlos@systemhalted.org
Subject: Re: [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3
Date: Fri, 25 Mar 2022 12:57:48 -0700 [thread overview]
Message-ID: <CAMe9rOp1xEQW7ZY2vQoCXhAphVoRtk3xKK01zwEiYQHQWfx21g@mail.gmail.com> (raw)
In-Reply-To: <20220325183625.1170867-6-goldstein.w.n@gmail.com>
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
> prefer SSSE3. As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
> sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
> sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
> sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
> sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
> 6 files changed, 3572 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index a2ebc06c5f..292353bad7 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -42,13 +42,11 @@ sysdep_routines += \
> stpcpy-evex \
> stpcpy-sse2 \
> stpcpy-sse2-unaligned \
> - stpcpy-ssse3 \
> stpncpy-avx2 \
> stpncpy-avx2-rtm \
> stpncpy-c \
> stpncpy-evex \
> stpncpy-sse2-unaligned \
> - stpncpy-ssse3 \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> strcasecmp_l-evex \
> @@ -79,7 +77,6 @@ sysdep_routines += \
> strcpy-evex \
> strcpy-sse2 \
> strcpy-sse2-unaligned \
> - strcpy-ssse3 \
> strcspn-c \
> strcspn-sse2 \
> strlen-avx2 \
> @@ -106,7 +103,6 @@ sysdep_routines += \
> strncpy-c \
> strncpy-evex \
> strncpy-sse2-unaligned \
> - strncpy-ssse3 \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 4133ed7e43..505b8002e0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
> __stpncpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpcpy.c. */
> IFUNC_IMPL (i, name, stpcpy,
> - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
> __stpcpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcpy_evex)
> - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> - __strcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncpy_evex)
> - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> - __strncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strncpy, 1,
> __strncpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_ssse3
> -# endif
> -
> - .section .text.ssse3,"ax",@progbits
> -ENTRY (STRCPY)
> -
> - mov %rsi, %rcx
> -# ifdef USE_AS_STRNCPY
> - mov %RDX_LP, %R8_LP
> -# endif
> - mov %rdi, %rdx
> -# ifdef USE_AS_STRNCPY
> - test %R8_LP, %R8_LP
> - jz L(Exit0)
> - cmp $8, %R8_LP
> - jbe L(StrncpyExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - jb L(StrncpyExit15Bytes)
> -# endif
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - je L(Exit16)
> -# endif
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - mov %rcx, %rsi
> - sub $16, %r8
> - and $0xf, %rsi
> -
> -/* add 16 bytes rcx_offset to r8 */
> -
> - add %rsi, %r8
> -# endif
> - lea 16(%rcx), %rsi
> - and $-16, %rsi
> - pxor %xmm0, %xmm0
> - mov (%rcx), %r9
> - mov %r9, (%rdx)
> - pcmpeqb (%rsi), %xmm0
> - mov 8(%rcx), %r9
> - mov %r9, 8(%rdx)
> -
> -/* convert byte mask in xmm0 to bit mask */
> -
> - pmovmskb %xmm0, %rax
> - sub %rcx, %rsi
> -
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - mov %rdx, %rax
> - lea 16(%rdx), %rdx
> - and $-16, %rdx
> - sub %rdx, %rax
> -
> -# ifdef USE_AS_STRNCPY
> - add %rax, %rsi
> - lea -1(%rsi), %rsi
> - and $1<<31, %esi
> - test %rsi, %rsi
> - jnz L(ContinueCopy)
> - lea 16(%r8), %r8
> -
> -L(ContinueCopy):
> -# endif
> - sub %rax, %rcx
> - mov %rcx, %rax
> - and $0xf, %rax
> - mov $0, %rsi
> -
> -/* case: rcx_offset == rdx_offset */
> -
> - jz L(Align16Both)
> -
> - cmp $8, %rax
> - jae L(ShlHigh8)
> - cmp $1, %rax
> - je L(Shl1)
> - cmp $2, %rax
> - je L(Shl2)
> - cmp $3, %rax
> - je L(Shl3)
> - cmp $4, %rax
> - je L(Shl4)
> - cmp $5, %rax
> - je L(Shl5)
> - cmp $6, %rax
> - je L(Shl6)
> - jmp L(Shl7)
> -
> -L(ShlHigh8):
> - je L(Shl8)
> - cmp $9, %rax
> - je L(Shl9)
> - cmp $10, %rax
> - je L(Shl10)
> - cmp $11, %rax
> - je L(Shl11)
> - cmp $12, %rax
> - je L(Shl12)
> - cmp $13, %rax
> - je L(Shl13)
> - cmp $14, %rax
> - je L(Shl14)
> - jmp L(Shl15)
> -
> -L(Align16Both):
> - movaps (%rcx), %xmm1
> - movaps 16(%rcx), %xmm2
> - movaps %xmm1, (%rdx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm4
> - movaps %xmm3, (%rdx, %rsi)
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm1
> - movaps %xmm4, (%rdx, %rsi)
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm2
> - movaps %xmm1, (%rdx, %rsi)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm3, (%rdx, %rsi)
> - mov %rcx, %rax
> - lea 16(%rcx, %rsi), %rcx
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - lea 112(%r8, %rax), %r8
> -# endif
> - mov $-0x40, %rsi
> -
> - .p2align 4
> -L(Aligned64Loop):
> - movaps (%rcx), %xmm2
> - movaps %xmm2, %xmm4
> - movaps 16(%rcx), %xmm5
> - movaps 32(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 48(%rcx), %xmm7
> - pminub %xmm5, %xmm2
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rax
> - lea 64(%rdx), %rdx
> - lea 64(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeaveCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Aligned64Leave)
> - movaps %xmm4, -64(%rdx)
> - movaps %xmm5, -48(%rdx)
> - movaps %xmm6, -32(%rdx)
> - movaps %xmm7, -16(%rdx)
> - jmp L(Aligned64Loop)
> -
> -L(Aligned64Leave):
> -# ifdef USE_AS_STRNCPY
> - lea 48(%r8), %r8
> -# endif
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm6, -32(%rdx)
> - pcmpeqb %xmm7, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl1):
> - movaps -1(%rcx), %xmm1
> - movaps 15(%rcx), %xmm2
> -L(Shl1Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 31(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -15(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -1(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl1LoopStart):
> - movaps 15(%rcx), %xmm2
> - movaps 31(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 47(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 63(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $1, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $1, %xmm3, %xmm4
> - jnz L(Shl1Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave1)
> -# endif
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl1LoopStart)
> -
> -L(Shl1LoopExit):
> - movdqu -1(%rcx), %xmm1
> - mov $15, %rsi
> - movdqu %xmm1, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl2):
> - movaps -2(%rcx), %xmm1
> - movaps 14(%rcx), %xmm2
> -L(Shl2Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 30(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -14(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -2(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl2LoopStart):
> - movaps 14(%rcx), %xmm2
> - movaps 30(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 46(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 62(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $2, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $2, %xmm3, %xmm4
> - jnz L(Shl2Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave2)
> -# endif
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl2LoopStart)
> -
> -L(Shl2LoopExit):
> - movdqu -2(%rcx), %xmm1
> - mov $14, %rsi
> - movdqu %xmm1, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl3):
> - movaps -3(%rcx), %xmm1
> - movaps 13(%rcx), %xmm2
> -L(Shl3Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 29(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -13(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -3(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl3LoopStart):
> - movaps 13(%rcx), %xmm2
> - movaps 29(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 45(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 61(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $3, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $3, %xmm3, %xmm4
> - jnz L(Shl3Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave3)
> -# endif
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl3LoopStart)
> -
> -L(Shl3LoopExit):
> - movdqu -3(%rcx), %xmm1
> - mov $13, %rsi
> - movdqu %xmm1, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl4):
> - movaps -4(%rcx), %xmm1
> - movaps 12(%rcx), %xmm2
> -L(Shl4Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 28(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -12(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -4(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl4LoopStart):
> - movaps 12(%rcx), %xmm2
> - movaps 28(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 44(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 60(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $4, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $4, %xmm3, %xmm4
> - jnz L(Shl4Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave4)
> -# endif
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl4LoopStart)
> -
> -L(Shl4LoopExit):
> - movdqu -4(%rcx), %xmm1
> - mov $12, %rsi
> - movdqu %xmm1, -4(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl5):
> - movaps -5(%rcx), %xmm1
> - movaps 11(%rcx), %xmm2
> -L(Shl5Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 27(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -11(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -5(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl5LoopStart):
> - movaps 11(%rcx), %xmm2
> - movaps 27(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 43(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 59(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $5, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $5, %xmm3, %xmm4
> - jnz L(Shl5Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave5)
> -# endif
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl5LoopStart)
> -
> -L(Shl5LoopExit):
> - movdqu -5(%rcx), %xmm1
> - mov $11, %rsi
> - movdqu %xmm1, -5(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl6):
> - movaps -6(%rcx), %xmm1
> - movaps 10(%rcx), %xmm2
> -L(Shl6Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 26(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -10(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -6(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl6LoopStart):
> - movaps 10(%rcx), %xmm2
> - movaps 26(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 42(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 58(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $6, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $6, %xmm3, %xmm4
> - jnz L(Shl6Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave6)
> -# endif
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl6LoopStart)
> -
> -L(Shl6LoopExit):
> - mov (%rcx), %r9
> - mov 6(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 6(%rdx)
> - mov $10, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl7):
> - movaps -7(%rcx), %xmm1
> - movaps 9(%rcx), %xmm2
> -L(Shl7Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 25(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -9(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -7(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl7LoopStart):
> - movaps 9(%rcx), %xmm2
> - movaps 25(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 41(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 57(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $7, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $7, %xmm3, %xmm4
> - jnz L(Shl7Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave7)
> -# endif
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl7LoopStart)
> -
> -L(Shl7LoopExit):
> - mov (%rcx), %r9
> - mov 5(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 5(%rdx)
> - mov $9, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl8):
> - movaps -8(%rcx), %xmm1
> - movaps 8(%rcx), %xmm2
> -L(Shl8Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 24(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -8(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -8(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl8LoopStart):
> - movaps 8(%rcx), %xmm2
> - movaps 24(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 40(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 56(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $8, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $8, %xmm3, %xmm4
> - jnz L(Shl8Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave8)
> -# endif
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl8LoopStart)
> -
> -L(Shl8LoopExit):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl9):
> - movaps -9(%rcx), %xmm1
> - movaps 7(%rcx), %xmm2
> -L(Shl9Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 23(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -7(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -9(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl9LoopStart):
> - movaps 7(%rcx), %xmm2
> - movaps 23(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 39(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 55(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $9, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $9, %xmm3, %xmm4
> - jnz L(Shl9Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave9)
> -# endif
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl9LoopStart)
> -
> -L(Shl9LoopExit):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl10):
> - movaps -10(%rcx), %xmm1
> - movaps 6(%rcx), %xmm2
> -L(Shl10Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 22(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -6(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -10(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl10LoopStart):
> - movaps 6(%rcx), %xmm2
> - movaps 22(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 38(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 54(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $10, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $10, %xmm3, %xmm4
> - jnz L(Shl10Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave10)
> -# endif
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl10LoopStart)
> -
> -L(Shl10LoopExit):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl11):
> - movaps -11(%rcx), %xmm1
> - movaps 5(%rcx), %xmm2
> -L(Shl11Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 21(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -5(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -11(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl11LoopStart):
> - movaps 5(%rcx), %xmm2
> - movaps 21(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 37(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 53(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $11, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $11, %xmm3, %xmm4
> - jnz L(Shl11Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave11)
> -# endif
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl11LoopStart)
> -
> -L(Shl11LoopExit):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl12):
> - movaps -12(%rcx), %xmm1
> - movaps 4(%rcx), %xmm2
> -L(Shl12Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 20(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -4(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -12(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl12LoopStart):
> - movaps 4(%rcx), %xmm2
> - movaps 20(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 36(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 52(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $12, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $12, %xmm3, %xmm4
> - jnz L(Shl12Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave12)
> -# endif
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl12LoopStart)
> -
> -L(Shl12LoopExit):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl13):
> - movaps -13(%rcx), %xmm1
> - movaps 3(%rcx), %xmm2
> -L(Shl13Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 19(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -3(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -13(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl13LoopStart):
> - movaps 3(%rcx), %xmm2
> - movaps 19(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 35(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 51(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $13, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $13, %xmm3, %xmm4
> - jnz L(Shl13Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave13)
> -# endif
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl13LoopStart)
> -
> -L(Shl13LoopExit):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl14):
> - movaps -14(%rcx), %xmm1
> - movaps 2(%rcx), %xmm2
> -L(Shl14Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 18(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -2(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -14(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl14LoopStart):
> - movaps 2(%rcx), %xmm2
> - movaps 18(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 34(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 50(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $14, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $14, %xmm3, %xmm4
> - jnz L(Shl14Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave14)
> -# endif
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl14LoopStart)
> -
> -L(Shl14LoopExit):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl15):
> - movaps -15(%rcx), %xmm1
> - movaps 1(%rcx), %xmm2
> -L(Shl15Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 17(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -1(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -15(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl15LoopStart):
> - movaps 1(%rcx), %xmm2
> - movaps 17(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 33(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 49(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $15, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $15, %xmm3, %xmm4
> - jnz L(Shl15Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave15)
> -# endif
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl15LoopStart)
> -
> -L(Shl15LoopExit):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> -# ifdef USE_AS_STRCAT
> - jmp L(CopyFrom1To16Bytes)
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> -# ifdef USE_AS_STRNCPY
> - add $16, %r8
> -# endif
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> -
> - .p2align 4
> -L(Exit8):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $8, %r8
> - lea 8(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> -
> - .p2align 4
> -L(Exit16):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %rax
> - mov %rax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - lea 16(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - cmp $1, %r8
> - je L(Exit1)
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - test $0x40, %al
> - jnz L(Exit7)
> - jmp L(Exit8)
> -
> - .p2align 4
> -L(ExitHighCase2):
> - cmp $9, %r8
> - je L(Exit9)
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $15, %r8
> - je L(Exit15)
> - test $0x40, %ah
> - jnz L(Exit15)
> - jmp L(Exit16)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $16, %r8
> - je L(Exit16)
> - cmp $8, %r8
> - je L(Exit8)
> - jg L(More8Case3)
> - cmp $4, %r8
> - je L(Exit4)
> - jg L(More4Case3)
> - cmp $2, %r8
> - jl L(Exit1)
> - je L(Exit2)
> - jg L(Exit3)
> -L(More8Case3): /* but less than 16 */
> - cmp $12, %r8
> - je L(Exit12)
> - jl L(Less12Case3)
> - cmp $14, %r8
> - jl L(Exit13)
> - je L(Exit14)
> - jg L(Exit15)
> -L(More4Case3): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Exit5)
> - je L(Exit6)
> - jg L(Exit7)
> -L(Less12Case3): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Exit9)
> - je L(Exit10)
> - jg L(Exit11)
> -# endif
> -
> - .p2align 4
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea (%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $1, %r8
> - lea 1(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $2, %r8
> - lea 2(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $3, %r8
> - lea 3(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $4, %r8
> - lea 4(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit5):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $5, %r8
> - lea 5(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $6, %r8
> - lea 6(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movl 3(%rcx), %eax
> - movl %eax, 3(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $7, %r8
> - lea 7(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit9):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %eax
> - mov %eax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $9, %r8
> - lea 9(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %eax
> - mov %eax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $10, %r8
> - lea 10(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $11, %r8
> - lea 11(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $12, %r8
> - lea 12(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit13):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %rax
> - mov %rax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $13, %r8
> - lea 13(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %rax
> - mov %rax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $14, %r8
> - lea 14(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $15, %r8
> - lea 15(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(Fill0):
> - ret
> -
> - .p2align 4
> -L(Fill1):
> - movb %dl, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill2):
> - movw %dx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill3):
> - movw %dx, (%rcx)
> - movb %dl, 2(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill4):
> - movl %edx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill5):
> - movl %edx, (%rcx)
> - movb %dl, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill6):
> - movl %edx, (%rcx)
> - movw %dx, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill7):
> - movl %edx, (%rcx)
> - movl %edx, 3(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill8):
> - mov %rdx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill9):
> - mov %rdx, (%rcx)
> - movb %dl, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill10):
> - mov %rdx, (%rcx)
> - movw %dx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill11):
> - mov %rdx, (%rcx)
> - movl %edx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill12):
> - mov %rdx, (%rcx)
> - movl %edx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill13):
> - mov %rdx, (%rcx)
> - mov %rdx, 5(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill14):
> - mov %rdx, (%rcx)
> - mov %rdx, 6(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill15):
> - mov %rdx, (%rcx)
> - mov %rdx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill16):
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(StrncpyFillExit1):
> - lea 16(%r8), %r8
> -L(FillFrom1To16Bytes):
> - test %r8, %r8
> - jz L(Fill0)
> - cmp $16, %r8
> - je L(Fill16)
> - cmp $8, %r8
> - je L(Fill8)
> - jg L(FillMore8)
> - cmp $4, %r8
> - je L(Fill4)
> - jg L(FillMore4)
> - cmp $2, %r8
> - jl L(Fill1)
> - je L(Fill2)
> - jg L(Fill3)
> -L(FillMore8): /* but less than 16 */
> - cmp $12, %r8
> - je L(Fill12)
> - jl L(FillLess12)
> - cmp $14, %r8
> - jl L(Fill13)
> - je L(Fill14)
> - jg L(Fill15)
> -L(FillMore4): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Fill5)
> - je L(Fill6)
> - jg L(Fill7)
> -L(FillLess12): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Fill9)
> - je L(Fill10)
> - jmp L(Fill11)
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero1):
> - xor %rdx, %rdx
> - sub $16, %r8
> - jbe L(StrncpyFillExit1)
> -
> - pxor %xmm0, %xmm0
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> -
> - lea 16(%rcx), %rcx
> -
> - mov %rcx, %rdx
> - and $0xf, %rdx
> - sub %rdx, %rcx
> - add %rdx, %r8
> - xor %rdx, %rdx
> - sub $64, %r8
> - jb L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - movdqa %xmm0, 32(%rcx)
> - movdqa %xmm0, 48(%rcx)
> - lea 64(%rcx), %rcx
> - sub $64, %r8
> - jae L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> - add $32, %r8
> - jl L(StrncpyFillLess32)
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - lea 32(%rcx), %rcx
> - sub $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> -L(StrncpyFillLess32):
> - add $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> - .p2align 4
> -L(Exit0):
> - mov %rdx, %rax
> - ret
> -
> - .p2align 4
> -L(StrncpyExit15Bytes):
> - cmp $9, %r8
> - je L(Exit9)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit8Bytes):
> - cmp $1, %r8
> - je L(Exit1)
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> -# endif
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(StrncpyLeaveCase2OrCase3):
> - test %rax, %rax
> - jnz L(Aligned64LeaveCase2)
> -
> -L(Aligned64LeaveCase3):
> - lea 64(%r8), %r8
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -L(Aligned64LeaveCase2):
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - add $48, %r8
> - jle L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase2)
> -/*--------------------------------------------------*/
> - .p2align 4
> -L(StrncpyExit1Case2OrCase3):
> - movdqu -1(%rcx), %xmm0
> - movdqu %xmm0, -1(%rdx)
> - mov $15, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit2Case2OrCase3):
> - movdqu -2(%rcx), %xmm0
> - movdqu %xmm0, -2(%rdx)
> - mov $14, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit3Case2OrCase3):
> - movdqu -3(%rcx), %xmm0
> - movdqu %xmm0, -3(%rdx)
> - mov $13, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit4Case2OrCase3):
> - movdqu -4(%rcx), %xmm0
> - movdqu %xmm0, -4(%rdx)
> - mov $12, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit5Case2OrCase3):
> - movdqu -5(%rcx), %xmm0
> - movdqu %xmm0, -5(%rdx)
> - mov $11, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit6Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 6(%rcx), %r9d
> - mov %r9d, 6(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $10, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit7Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 5(%rcx), %r9d
> - mov %r9d, 5(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $9, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit8Case2OrCase3):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit9Case2OrCase3):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit10Case2OrCase3):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit11Case2OrCase3):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit12Case2OrCase3):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit13Case2OrCase3):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit14Case2OrCase3):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit15Case2OrCase3):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave1):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit1)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit1):
> - lea 15(%rdx, %rsi), %rdx
> - lea 15(%rcx, %rsi), %rcx
> - mov -15(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -15(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave2):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit2)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit2):
> - lea 14(%rdx, %rsi), %rdx
> - lea 14(%rcx, %rsi), %rcx
> - mov -14(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -14(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave3):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit3)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit3):
> - lea 13(%rdx, %rsi), %rdx
> - lea 13(%rcx, %rsi), %rcx
> - mov -13(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -13(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave4):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit4)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit4):
> - lea 12(%rdx, %rsi), %rdx
> - lea 12(%rcx, %rsi), %rcx
> - mov -12(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -12(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave5):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit5)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit5):
> - lea 11(%rdx, %rsi), %rdx
> - lea 11(%rcx, %rsi), %rcx
> - mov -11(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -11(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave6):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit6)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit6):
> - lea 10(%rdx, %rsi), %rdx
> - lea 10(%rcx, %rsi), %rcx
> - mov -10(%rcx), %rsi
> - movw -2(%rcx), %ax
> - mov %rsi, -10(%rdx)
> - movw %ax, -2(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave7):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit7)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit7):
> - lea 9(%rdx, %rsi), %rdx
> - lea 9(%rcx, %rsi), %rcx
> - mov -9(%rcx), %rsi
> - movb -1(%rcx), %ah
> - mov %rsi, -9(%rdx)
> - movb %ah, -1(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave8):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit8)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit8):
> - lea 8(%rdx, %rsi), %rdx
> - lea 8(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave9):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit9)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit9):
> - lea 7(%rdx, %rsi), %rdx
> - lea 7(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave10):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit10)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit10):
> - lea 6(%rdx, %rsi), %rdx
> - lea 6(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave11):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit11)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit11):
> - lea 5(%rdx, %rsi), %rdx
> - lea 5(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave12):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit12)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit12):
> - lea 4(%rdx, %rsi), %rdx
> - lea 4(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave13):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit13)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit13):
> - lea 3(%rdx, %rsi), %rdx
> - lea 3(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave14):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit14)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit14):
> - lea 2(%rdx, %rsi), %rdx
> - lea 2(%rcx, %rsi), %rcx
> - movw -2(%rcx), %ax
> - xor %rsi, %rsi
> - movw %ax, -2(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave15):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit15)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit15):
> - lea 1(%rdx, %rsi), %rdx
> - lea 1(%rcx, %rsi), %rcx
> - movb -1(%rcx), %ah
> - xor %rsi, %rsi
> - movb %ah, -1(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -# endif
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> deleted file mode 100644
> index bf82ee447d..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_ssse3
> -#include "strcpy-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
next prev parent reply other threads:[~2022-03-25 19:58 UTC|newest]
Thread overview: 49+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-04-10 0:57 ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-14 18:05 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-14 18:06 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-14 18:10 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 18:13 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu [this message]
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
2022-03-25 20:40 ` Noah Goldstein
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAMe9rOp1xEQW7ZY2vQoCXhAphVoRtk3xKK01zwEiYQHQWfx21g@mail.gmail.com \
--to=hjl.tools@gmail.com \
--cc=carlos@systemhalted.org \
--cc=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).