* [PATCH v1 2/4] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S
2022-07-12 19:28 [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII Noah Goldstein
@ 2022-07-12 19:28 ` Noah Goldstein
2022-07-12 23:57 ` H.J. Lu
2022-07-12 19:28 ` [PATCH v1 3/4] x86: Move wcscmp SSE2 implementation to multiarch/wcscmp-sse2.S Noah Goldstein
` (2 subsequent siblings)
3 siblings, 1 reply; 8+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:28 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Because strcmp-sse2.S implements so many functions (more from
avx2/evex/sse42) add a new file 'strcmp-naming.h' to assist in
getting the correct symbol name for all the functions across
multiarch/non-multiarch builds.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/rtld-strcmp.S | 18 +
sysdeps/x86_64/multiarch/rtld-strncmp.S | 18 +
sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S | 5 +-
sysdeps/x86_64/multiarch/strcmp-naming.h | 68 +
sysdeps/x86_64/multiarch/strcmp-sse2.S | 2140 ++++++++++++++++-
sysdeps/x86_64/multiarch/strncase_l-sse2.S | 5 +-
sysdeps/x86_64/multiarch/strncmp-sse2.S | 12 +-
sysdeps/x86_64/strcasecmp_l.S | 11 +-
sysdeps/x86_64/strcmp.S | 2147 +-----------------
sysdeps/x86_64/strncase_l.S | 11 +-
sysdeps/x86_64/strncmp.S | 7 +-
11 files changed, 2264 insertions(+), 2178 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/rtld-strcmp.S
create mode 100644 sysdeps/x86_64/multiarch/rtld-strncmp.S
create mode 100644 sysdeps/x86_64/multiarch/strcmp-naming.h
diff --git a/sysdeps/x86_64/multiarch/rtld-strcmp.S b/sysdeps/x86_64/multiarch/rtld-strcmp.S
new file mode 100644
index 0000000000..207078bdcc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strcmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strncmp.S b/sysdeps/x86_64/multiarch/rtld-strncmp.S
new file mode 100644
index 0000000000..ac32150406
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strncmp.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strncmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
index 2360d104dd..a2b5741399 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
@@ -16,8 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP __strcasecmp_l_sse2
#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define __strcasecmp __strcasecmp_sse2
-#include <sysdeps/x86_64/strcmp.S>
+#include "strcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-naming.h b/sysdeps/x86_64/multiarch/strcmp-naming.h
new file mode 100644
index 0000000000..6a7529b6a4
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-naming.h
@@ -0,0 +1,68 @@
+#ifndef _STRCMP_NAMING_H_
+#define _STRCMP_NAMING_H_
+
+/* Utility macros. */
+#define STRCMP_SUFFIX(x, y) x##y
+#define STRCMP_NAME(x, y) STRCMP_SUFFIX (x, y)
+
+/* Setup base of all definitions. */
+#define STRNCASECMP_BASE __strncasecmp
+#define STRCASECMP_BASE __strcasecmp
+#define WCSCMP_BASE __wcscmp
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# define WCSNCMP_BASE __wcsncmp
+# define STRNCMP_BASE __strncmp
+# define STRCMP_BASE __strcmp
+
+#else
+/* Covers IS_IN (rtld) or non-multiarch build. */
+# define WCSNCMP_BASE wcsncmp
+# define STRNCMP_BASE strncmp
+# define STRCMP_BASE strcmp
+
+# undef STRCMP_ISA
+# define STRCMP_ISA
+#endif
+
+#if IS_IN (rtld) || defined USE_MULTIARCH
+# define ISA_HIDDEN_JUMPTARGET(...) __VA_ARGS__
+#else
+# define ISA_HIDDEN_JUMPTARGET(...) HIDDEN_JUMPTARGET (__VA_ARGS__)
+#endif
+
+/* Get correct symbol for OVERFLOW_STRCMP, STRCMP, and
+ STRCASECMP. */
+#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+
+# if defined USE_AS_WCSCMP || defined USE_AS_WCSNCMP
+# define OVERFLOW_STRCMP_SYM WCSCMP_BASE
+# define STRCMP_SYM WCSNCMP_BASE
+# elif defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define OVERFLOW_STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
+# define STRCMP_SYM STRCMP_NAME (STRNCASECMP_BASE, _l)
+# else
+# define OVERFLOW_STRCMP_SYM STRCMP_BASE
+# define STRCMP_SYM STRNCMP_BASE
+# endif
+
+# define STRCASECMP_SYM STRNCASECMP_BASE
+# define OVERFLOW_STRCMP \
+ ISA_HIDDEN_JUMPTARGET (STRCMP_NAME (OVERFLOW_STRCMP_SYM, STRCMP_ISA))
+#else
+# ifdef USE_AS_WCSCMP
+# define STRCMP_SYM WCSCMP_BASE
+# elif defined USE_AS_STRCASECMP_L
+# define STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
+# else
+# define STRCMP_SYM STRCMP_BASE
+# endif
+
+# define STRCASECMP_SYM STRCASECMP_BASE
+#endif
+
+#define STRCASECMP_L_NONASCII STRCMP_NAME (STRCASECMP_SYM, _l_nonascii)
+#define STRCASECMP STRCMP_NAME (STRCASECMP_SYM, STRCMP_ISA)
+#define STRCMP STRCMP_NAME (STRCMP_SYM, STRCMP_ISA)
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S
index b8f95e59cf..b1220231ab 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S
@@ -16,13 +16,2141 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
+#if IS_IN (libc) || IS_IN (rtld)
+
+# define STRCMP_ISA _sse2
+# include "strcmp-naming.h"
+
# include <sysdep.h>
-# define STRCMP __strcmp_sse2
+# undef UPDATE_STRNCMP_COUNTER
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcmp)
-#endif
+# ifndef LABEL
+# define LABEL(l) L(l)
+# endif
+
+# ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+ if the new counter > the old one or is 0. */
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate the remaining count to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+
+# elif defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER
+# elif defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+
+# define UPDATE_STRNCMP_COUNTER \
+ /* calculate the remaining count to compare */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+# else
+# define UPDATE_STRNCMP_COUNTER
+# endif
+
+ .text
+# ifdef USE_AS_STRCASECMP_L
+# ifndef ENTRY2
+# define ENTRY2(name) ENTRY (name)
+# define END2(name) END (name)
+# endif
+
+ENTRY2 (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RDX_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END2 (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp_l. */
+# elif defined USE_AS_STRNCASECMP_L
+# ifndef ENTRY2
+# define ENTRY2(name) ENTRY (name)
+# define END2(name) END (name)
+# endif
+
+ENTRY2 (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RCX_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END2 (STRCASECMP)
+ /* FALLTHROUGH to strncasecmp_l. */
+# endif
+
+ENTRY (STRCMP)
+# ifdef USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
+# else
+ mov (%rdx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strcasecmp_l_nonascii
+# elif defined USE_AS_STRNCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
+# else
+ mov (%rcx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strncasecmp_l_nonascii
+# endif
+
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ test %RDX_LP, %RDX_LP
+ je LABEL(strcmp_exitz)
+ cmp $1, %RDX_LP
+ je LABEL(Byte0)
+ mov %RDX_LP, %R11_LP
+# endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+.Llcase_min:
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+.Llcase_max:
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+.Lcase_add:
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+ movdqa .Llcase_min(%rip), %xmm5
+# define LCASE_MIN_reg %xmm5
+ movdqa .Llcase_max(%rip), %xmm6
+# define LCASE_MAX_reg %xmm6
+ movdqa .Lcase_add(%rip), %xmm7
+# define CASE_ADD_reg %xmm7
+# endif
+ cmp $0x30, %ecx
+ ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
+ movlpd (%rdi), %xmm1
+ movlpd (%rsi), %xmm2
+ movhpd 8(%rdi), %xmm1
+ movhpd 8(%rsi), %xmm2
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ movdqa LCASE_MIN_reg, %xmm9; \
+ paddb reg1, %xmm8; \
+ paddb reg2, %xmm9; \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm9; \
+ pandn CASE_ADD_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm9; \
+ paddb %xmm8, reg1; \
+ paddb %xmm9, reg2
+ TOLOWER (%xmm1, %xmm2)
+# else
+# define TOLOWER(reg1, reg2)
+# endif
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
+ jnz LABEL(less16bytes) /* If not, find different value or null char */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* finish comparison */
+# endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte alignment.
+ * Use relative offset difference between the two to determine which case
+ * below to use.
+ */
+ .p2align 4
+LABEL(crosscache):
+ and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
+ and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ cmp %eax, %ecx
+ je LABEL(ashr_0) /* rsi and rdi relative offset same */
+ ja LABEL(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax
+ xchg %rsi, %rdi
+LABEL(bigger):
+ lea 15(%rax), %r9
+ sub %rcx, %r9
+ lea LABEL(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9
+ lea (%r10, %r9), %r10
+ _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+LABEL(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+# else
+ movdqa (%rdi), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
+# endif
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+ * edx equals r9d only when the remaining (16 - rcx) bytes of both
+ * strings match and no null char was seen among them.
+ */
+ jne LABEL(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+ pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
+
+ /*
+ * Now both strings are aligned at 16-byte boundary. Loop over strings
+ * checking 32-bytes per iteration.
+ */
+ .p2align 4
+LABEL(loop_ashr_0):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit) /* mismatch or null char seen */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ jmp LABEL(loop_ashr_0)
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+LABEL(ashr_1):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads*/
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_1):
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+LABEL(gobble_ashr_1):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_1) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* store for next cycle */
+
+ psrldq $1, %xmm3
+ pslldq $15, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_1)
+
+ /*
+ * Nibble avoids loads across page boundary. This is to avoid a potential
+ * access into unmapped memory.
+ */
+ .p2align 4
+LABEL(nibble_ashr_1):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
+ pmovmskb %xmm0, %edx
+ test $0xfffe, %edx
+ jnz LABEL(ashr_1_exittail) /* find null char*/
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $15, %r11
+ jbe LABEL(ashr_1_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* subtract 4K from %r10 */
+ jmp LABEL(gobble_ashr_1)
+
+ /*
+ * Once a null char is found, determine if there is a string mismatch
+ * before the null char.
+ */
+ .p2align 4
+LABEL(ashr_1_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $1, %xmm0
+ psrldq $1, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+LABEL(ashr_2):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $14, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_2):
+ add $16, %r10
+ jg LABEL(nibble_ashr_2)
+
+LABEL(gobble_ashr_2):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_2) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $2, %xmm3
+ pslldq $14, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_2)
+
+ .p2align 4
+LABEL(nibble_ashr_2):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfffc, %edx
+ jnz LABEL(ashr_2_exittail)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $14, %r11
+ jbe LABEL(ashr_2_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_2)
+
+ .p2align 4
+LABEL(ashr_2_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $2, %xmm0
+ psrldq $2, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+LABEL(ashr_3):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $13, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_3):
+ add $16, %r10
+ jg LABEL(nibble_ashr_3)
+
+LABEL(gobble_ashr_3):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_3) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $3, %xmm3
+ pslldq $13, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_3)
+
+ .p2align 4
+LABEL(nibble_ashr_3):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff8, %edx
+ jnz LABEL(ashr_3_exittail)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $13, %r11
+ jbe LABEL(ashr_3_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_3)
+
+ .p2align 4
+LABEL(ashr_3_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $3, %xmm0
+ psrldq $3, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+LABEL(ashr_4):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $12, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_4):
+ add $16, %r10
+ jg LABEL(nibble_ashr_4)
+
+LABEL(gobble_ashr_4):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_4) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $4, %xmm3
+ pslldq $12, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_4)
+
+ .p2align 4
+LABEL(nibble_ashr_4):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfff0, %edx
+ jnz LABEL(ashr_4_exittail)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $12, %r11
+ jbe LABEL(ashr_4_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_4)
+
+ .p2align 4
+LABEL(ashr_4_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $4, %xmm0
+ psrldq $4, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+LABEL(ashr_5):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $11, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_5):
+ add $16, %r10
+ jg LABEL(nibble_ashr_5)
+
+LABEL(gobble_ashr_5):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_5) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $5, %xmm3
+ pslldq $11, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_5)
+
+ .p2align 4
+LABEL(nibble_ashr_5):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffe0, %edx
+ jnz LABEL(ashr_5_exittail)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $11, %r11
+ jbe LABEL(ashr_5_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_5)
+
+ .p2align 4
+LABEL(ashr_5_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $5, %xmm0
+ psrldq $5, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ */
+ .p2align 4
+LABEL(ashr_6):
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0
+ pslldq $10, %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2
+ psubb %xmm0, %xmm2
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes)
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_6):
+ add $16, %r10
+ jg LABEL(nibble_ashr_6)
+
+LABEL(gobble_ashr_6):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+
+ add $16, %r10
+ jg LABEL(nibble_ashr_6) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $6, %xmm3
+ pslldq $10, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_6)
+
+ .p2align 4
+LABEL(nibble_ashr_6):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xffc0, %edx
+ jnz LABEL(ashr_6_exittail)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $10, %r11
+ jbe LABEL(ashr_6_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10
+ jmp LABEL(gobble_ashr_6)
+
+ .p2align 4
+LABEL(ashr_6_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $6, %xmm0
+ psrldq $6, %xmm3
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ */
+ .p2align 4
+LABEL(ashr_7): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 7 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $9, %xmm2 /* move the (%rdi) bytes up by 9 (= 16 - 7) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_7):
+ add $16, %r10
+ jg LABEL(nibble_ashr_7) /* cross page boundary */
+
+LABEL(gobble_ashr_7):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $7, %xmm3 /* low 9 bytes: tail of the previous chunk */
+ pslldq $9, %xmm2 /* high 7 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_7) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $7, %xmm3
+ pslldq $9, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_7)
+
+ .p2align 4
+LABEL(nibble_ashr_7):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff80, %edx /* bits 7..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_7_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $9, %r11 /* does the limit end within the 9 leftover bytes? */
+ jbe LABEL(ashr_7_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_7)
+
+ .p2align 4
+LABEL(ashr_7_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $7, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $7, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ */
+ .p2align 4
+LABEL(ashr_8): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 8 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $8, %xmm2 /* move the (%rdi) bytes up by 8 (= 16 - 8) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_8):
+ add $16, %r10
+ jg LABEL(nibble_ashr_8) /* cross page boundary */
+
+LABEL(gobble_ashr_8):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $8, %xmm3 /* low 8 bytes: tail of the previous chunk */
+ pslldq $8, %xmm2 /* high 8 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
-#include <sysdeps/x86_64/strcmp.S>
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_8) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $8, %xmm3
+ pslldq $8, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_8)
+
+ .p2align 4
+LABEL(nibble_ashr_8):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xff00, %edx /* bits 8..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_8_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $8, %r11 /* does the limit end within the 8 leftover bytes? */
+ jbe LABEL(ashr_8_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_8)
+
+ .p2align 4
+LABEL(ashr_8_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $8, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $8, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ */
+ .p2align 4
+LABEL(ashr_9): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 9 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $7, %xmm2 /* move the (%rdi) bytes up by 7 (= 16 - 9) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_9):
+ add $16, %r10
+ jg LABEL(nibble_ashr_9) /* cross page boundary */
+
+LABEL(gobble_ashr_9):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $9, %xmm3 /* low 7 bytes: tail of the previous chunk */
+ pslldq $7, %xmm2 /* high 9 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_9) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $9, %xmm3
+ pslldq $7, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* store for next cycle */
+ jmp LABEL(loop_ashr_9)
+
+ .p2align 4
+LABEL(nibble_ashr_9):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfe00, %edx /* bits 9..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_9_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $7, %r11 /* does the limit end within the 7 leftover bytes? */
+ jbe LABEL(ashr_9_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_9)
+
+ .p2align 4
+LABEL(ashr_9_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $9, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $9, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ */
+ .p2align 4
+LABEL(ashr_10): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 10 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $6, %xmm2 /* move the (%rdi) bytes up by 6 (= 16 - 10) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_10):
+ add $16, %r10
+ jg LABEL(nibble_ashr_10) /* cross page boundary */
+
+LABEL(gobble_ashr_10):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $10, %xmm3 /* low 6 bytes: tail of the previous chunk */
+ pslldq $6, %xmm2 /* high 10 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_10) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $10, %xmm3
+ pslldq $6, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_10)
+
+ .p2align 4
+LABEL(nibble_ashr_10):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xfc00, %edx /* bits 10..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_10_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $6, %r11 /* does the limit end within the 6 leftover bytes? */
+ jbe LABEL(ashr_10_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_10)
+
+ .p2align 4
+LABEL(ashr_10_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $10, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $10, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ */
+ .p2align 4
+LABEL(ashr_11): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 11 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $5, %xmm2 /* move the (%rdi) bytes up by 5 (= 16 - 11) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_11):
+ add $16, %r10
+ jg LABEL(nibble_ashr_11) /* cross page boundary */
+
+LABEL(gobble_ashr_11):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $11, %xmm3 /* low 5 bytes: tail of the previous chunk */
+ pslldq $5, %xmm2 /* high 11 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_11) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $11, %xmm3
+ pslldq $5, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_11)
+
+ .p2align 4
+LABEL(nibble_ashr_11):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf800, %edx /* bits 11..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_11_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $5, %r11 /* does the limit end within the 5 leftover bytes? */
+ jbe LABEL(ashr_11_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_11)
+
+ .p2align 4
+LABEL(ashr_11_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $11, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $11, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ */
+ .p2align 4
+LABEL(ashr_12): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 12 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $4, %xmm2 /* move the (%rdi) bytes up by 4 (= 16 - 12) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_12):
+ add $16, %r10
+ jg LABEL(nibble_ashr_12) /* cross page boundary */
+
+LABEL(gobble_ashr_12):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $12, %xmm3 /* low 4 bytes: tail of the previous chunk */
+ pslldq $4, %xmm2 /* high 12 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_12) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $12, %xmm3
+ pslldq $4, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_12)
+
+ .p2align 4
+LABEL(nibble_ashr_12):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xf000, %edx /* bits 12..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_12_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $4, %r11 /* does the limit end within the 4 leftover bytes? */
+ jbe LABEL(ashr_12_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_12)
+
+ .p2align 4
+LABEL(ashr_12_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $12, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $12, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ */
+ .p2align 4
+LABEL(ashr_13): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 13 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $3, %xmm2 /* move the (%rdi) bytes up by 3 (= 16 - 13) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_13):
+ add $16, %r10
+ jg LABEL(nibble_ashr_13) /* cross page boundary */
+
+LABEL(gobble_ashr_13):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $13, %xmm3 /* low 3 bytes: tail of the previous chunk */
+ pslldq $3, %xmm2 /* high 13 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_13) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $13, %xmm3
+ pslldq $3, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_13)
+
+ .p2align 4
+LABEL(nibble_ashr_13):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xe000, %edx /* bits 13..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_13_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $3, %r11 /* does the limit end within the 3 leftover bytes? */
+ jbe LABEL(ashr_13_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_13)
+
+ .p2align 4
+LABEL(ashr_13_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $13, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $13, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ */
+ .p2align 4
+LABEL(ashr_14): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 14 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $2, %xmm2 /* move the (%rdi) bytes up by 2 (= 16 - 14) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_14):
+ add $16, %r10
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
+
+LABEL(gobble_ashr_14):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $14, %xmm3 /* low 2 bytes: tail of the previous chunk */
+ pslldq $2, %xmm2 /* high 14 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_14) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $14, %xmm3
+ pslldq $2, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_14)
+
+ .p2align 4
+LABEL(nibble_ashr_14):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0xc000, %edx /* bits 14..15: the bytes of xmm3 still pending */
+ jnz LABEL(ashr_14_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp $2, %r11 /* does the limit end within the 2 leftover bytes? */
+ jbe LABEL(ashr_14_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_14)
+
+ .p2align 4
+LABEL(ashr_14_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $14, %xmm0 /* line the null mask up with the leftover bytes */
+ psrldq $14, %xmm3 /* bring the leftover bytes down to byte 0 */
+ jmp LABEL(aftertail)
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+LABEL(ashr_15): /* each (%rsi) chunk is compared with 16 (%rdi) bytes starting 15 bytes into the previous (%rdi) chunk */
+ pxor %xmm0, %xmm0
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff where the (%rsi) byte is null */
+ pslldq $1, %xmm2 /* move the (%rdi) bytes up by 1 (= 16 - 15) */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff where the bytes match */
+ psubb %xmm0, %xmm2 /* lanes where (%rsi) is null lose their top bit */
+ pmovmskb %xmm2, %r9d /* bit i set: byte i matches and is not null */
+ shr %cl, %edx /* NOTE(review): %edx and %cl are set before this label, outside this hunk */
+ shr %cl, %r9d
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* masks differ: resolve within this first chunk */
+
+ movdqa (%rdi), %xmm3 /* xmm3 carries the previous (%rdi) chunk into the loop */
+
+ UPDATE_STRNCMP_COUNTER
+
+ pxor %xmm0, %xmm0
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ .p2align 4
+LABEL(loop_ashr_15):
+ add $16, %r10
+ jg LABEL(nibble_ashr_15) /* cross page boundary */
+
+LABEL(gobble_ashr_15):
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4 /* keep the unshifted chunk for the next round */
+
+ psrldq $15, %xmm3 /* low 1 byte: tail of the previous chunk */
+ pslldq $1, %xmm2 /* high 15 bytes: head of the current chunk */
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0 /* flag null bytes of the (%rsi) chunk */
+ pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* zero only if all 16 bytes match with no null */
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 fewer bytes left under the length limit */
+ jbe LABEL(strcmp_exitz) /* none left: compare equal */
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3 /* current chunk becomes the previous one */
+
+ add $16, %r10 /* second, unrolled copy of the loop body */
+ jg LABEL(nibble_ashr_15) /* cross page boundary */
+
+ movdqa (%rsi, %rcx), %xmm1
+ movdqa (%rdi, %rcx), %xmm2
+ movdqa %xmm2, %xmm4
+
+ psrldq $15, %xmm3
+ pslldq $1, %xmm2
+ por %xmm3, %xmm2 /* merge into one 16byte value */
+
+ TOLOWER (%xmm1, %xmm2)
+
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm2, %xmm1
+ psubb %xmm0, %xmm1
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx
+ jnz LABEL(exit)
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rcx
+ movdqa %xmm4, %xmm3
+ jmp LABEL(loop_ashr_15)
+
+ .p2align 4
+LABEL(nibble_ashr_15):
+ pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
+ pmovmskb %xmm0, %edx
+ test $0x8000, %edx /* bit 15: the one byte of xmm3 still pending */
+ jnz LABEL(ashr_15_exittail) /* null there: finish without loading the next (%rdi) chunk */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmpq $1, %r11 /* does the limit end within the 1 leftover byte? */
+ jbe LABEL(ashr_15_exittail)
+# endif
+
+ pxor %xmm0, %xmm0
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ jmp LABEL(gobble_ashr_15)
+
+ .p2align 4
+LABEL(ashr_15_exittail):
+ movdqa (%rsi, %rcx), %xmm1
+ psrldq $15, %xmm3 /* bring the leftover byte down to byte 0 */
+ psrldq $15, %xmm0 /* same shift for the null mask; no jmp, falls through to aftertail */
+
+ .p2align 4
+LABEL(aftertail): /* final compare of the leftover bytes in xmm3 against the (%rsi) chunk in xmm1 */
+ TOLOWER (%xmm1, %xmm3)
+ pcmpeqb %xmm3, %xmm1 /* xmm1 = 0xff where the bytes match */
+ psubb %xmm0, %xmm1 /* lanes flagged in xmm0 lose their top bit */
+ pmovmskb %xmm1, %edx
+ not %edx /* invert: set bits now mark a mismatch or a null */
+
+ .p2align 4
+LABEL(exit):
+ lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
+LABEL(less32bytes):
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz LABEL(ret) /* %r8d == 0: operands are already in their original order */
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+LABEL(ret):
+LABEL(less16bytes):
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %rdx, %r11 /* is that index at or past the remaining length limit? */
+ jbe LABEL(strcmp_exitz) /* yes: compare equal */
+# endif
+ movzbl (%rsi, %rdx), %ecx /* ecx = byte of (%rsi) at that index, zero-extended */
+ movzbl (%rdi, %rdx), %eax /* eax = byte of (%rdi) at that index, zero-extended */
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* rdx = table address plus 128 four-byte entries */
+ movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */
+ movl (%rdx,%rax,4), %eax
+# endif
+
+ sub %ecx, %eax /* return value = (%rdi) byte - (%rsi) byte */
+ ret
+
+LABEL(strcmp_exitz): /* shared exit for the "compare equal" result */
+ xor %eax, %eax /* return 0 */
+ ret
+
+ .p2align 4
+LABEL(Byte0): /* return the difference of the bytes at (%rdi) and (%rsi) */
+ movzbl (%rsi), %ecx /* ecx = first byte at (%rsi), zero-extended */
+ movzbl (%rdi), %eax /* eax = first byte at (%rdi), zero-extended */
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx /* rdx = table address plus 128 four-byte entries */
+ movl (%rdx,%rcx,4), %ecx /* replace each byte by its 32-bit table entry */
+ movl (%rdx,%rax,4), %eax
+# endif
+
+ sub %ecx, %eax /* return value = (%rdi) byte - (%rsi) byte */
+ ret
+END (STRCMP)
+
+ .section .rodata,"a",@progbits
+ .p2align 3
+LABEL(unaligned_table): /* 16 entries: 32-bit offset of each ashr_N label relative to this table */
+ .int LABEL(ashr_1) - LABEL(unaligned_table) /* index 0 */
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table) /* index 15: ashr_0 is stored last */
+#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2.S b/sysdeps/x86_64/multiarch/strncase_l-sse2.S
index 0ca4c836b2..fd8ad07450 100644
--- a/sysdeps/x86_64/multiarch/strncase_l-sse2.S
+++ b/sysdeps/x86_64/multiarch/strncase_l-sse2.S
@@ -16,8 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP __strncasecmp_l_sse2
-#define NO_NOLOCALE_ALIAS
#define USE_AS_STRNCASECMP_L
-#define __strncasecmp __strncasecmp_sse2
-#include <sysdeps/x86_64/strcmp.S>
+#include "strcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2.S b/sysdeps/x86_64/multiarch/strncmp-sse2.S
index e3ba94f926..2152b8dc3d 100644
--- a/sysdeps/x86_64/multiarch/strncmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-sse2.S
@@ -16,15 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
-#if IS_IN (libc)
-# define STRCMP __strncmp_sse2
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcmp)
-#else
-# define STRCMP strncmp
-#endif
-
#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
+#include "strcmp-sse2.S"
diff --git a/sysdeps/x86_64/strcasecmp_l.S b/sysdeps/x86_64/strcasecmp_l.S
index 5456b3a49e..84fd7fdfd3 100644
--- a/sysdeps/x86_64/strcasecmp_l.S
+++ b/sysdeps/x86_64/strcasecmp_l.S
@@ -1,6 +1,11 @@
-#define STRCMP __strcasecmp_l
-#define USE_AS_STRCASECMP_L
-#include "strcmp.S"
+/* Symbols = __strcasecmp_l and __strcasecmp. */
+
+#include "multiarch/strcasecmp_l-sse2.S"
+
+libc_hidden_builtin_def (__strcasecmp_l)
weak_alias (__strcasecmp_l, strcasecmp_l)
libc_hidden_def (strcasecmp_l)
+
+weak_alias (__strcasecmp, strcasecmp)
+libc_hidden_def (__strcasecmp)
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index c38dc627f9..19e54bd3a7 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -16,2148 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include "asm-syntax.h"
+/* Symbol = strcmp. */
-#undef UPDATE_STRNCMP_COUNTER
-
-#ifndef LABEL
-#define LABEL(l) L(l)
-#endif
-
-#ifdef USE_AS_STRNCMP
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-
-#elif defined USE_AS_STRCASECMP_L
-# include "locale-defines.h"
-
-# define UPDATE_STRNCMP_COUNTER
-#elif defined USE_AS_STRNCASECMP_L
-# include "locale-defines.h"
-
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-#else
-# define UPDATE_STRNCMP_COUNTER
-# ifndef STRCMP
-# define STRCMP strcmp
-# endif
-#endif
-
- .text
-#ifdef USE_AS_STRCASECMP_L
-# ifndef ENTRY2
-# define ENTRY2(name) ENTRY (name)
-# define END2(name) END (name)
-# endif
-
-ENTRY2 (__strcasecmp)
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RDX_LP
-
- /* Either 1 or 5 bytes (dependeing if CET is enabled). */
- .p2align 4
-END2 (__strcasecmp)
-# ifndef NO_NOLOCALE_ALIAS
-weak_alias (__strcasecmp, strcasecmp)
-libc_hidden_def (__strcasecmp)
-# endif
- /* FALLTHROUGH to strcasecmp_l. */
-#elif defined USE_AS_STRNCASECMP_L
-# ifndef ENTRY2
-# define ENTRY2(name) ENTRY (name)
-# define END2(name) END (name)
-# endif
-
-ENTRY2 (__strncasecmp)
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RCX_LP
-
- /* Either 1 or 5 bytes (dependeing if CET is enabled). */
- .p2align 4
-END2 (__strncasecmp)
-# ifndef NO_NOLOCALE_ALIAS
-weak_alias (__strncasecmp, strncasecmp)
-libc_hidden_def (__strncasecmp)
-# endif
- /* FALLTHROUGH to strncasecmp_l. */
-#endif
-
-ENTRY (STRCMP)
-#ifdef USE_AS_STRCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-# else
- mov (%rdx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strcasecmp_l_nonascii
-#elif defined USE_AS_STRNCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-# else
- mov (%rcx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strncasecmp_l_nonascii
-#endif
-
-/*
- * This implementation uses SSE to compare up to 16 bytes at a time.
- */
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %RDX_LP, %RDX_LP
- je LABEL(strcmp_exitz)
- cmp $1, %RDX_LP
- je LABEL(Byte0)
- mov %RDX_LP, %R11_LP
-#endif
- mov %esi, %ecx
- mov %edi, %eax
-/* Use 64bit AND here to avoid long NOP padding. */
- and $0x3f, %rcx /* rsi alignment in cache line */
- and $0x3f, %rax /* rdi alignment in cache line */
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- .section .rodata.cst16,"aM",@progbits,16
- .align 16
-.Llcase_min:
- .quad 0x3f3f3f3f3f3f3f3f
- .quad 0x3f3f3f3f3f3f3f3f
-.Llcase_max:
- .quad 0x9999999999999999
- .quad 0x9999999999999999
-.Lcase_add:
- .quad 0x2020202020202020
- .quad 0x2020202020202020
- .previous
- movdqa .Llcase_min(%rip), %xmm5
-# define LCASE_MIN_reg %xmm5
- movdqa .Llcase_max(%rip), %xmm6
-# define LCASE_MAX_reg %xmm6
- movdqa .Lcase_add(%rip), %xmm7
-# define CASE_ADD_reg %xmm7
-#endif
- cmp $0x30, %ecx
- ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
- cmp $0x30, %eax
- ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
- movlpd (%rdi), %xmm1
- movlpd (%rsi), %xmm2
- movhpd 8(%rdi), %xmm1
- movhpd 8(%rsi), %xmm2
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
- movdqa LCASE_MIN_reg, %xmm8; \
- movdqa LCASE_MIN_reg, %xmm9; \
- paddb reg1, %xmm8; \
- paddb reg2, %xmm9; \
- pcmpgtb LCASE_MAX_reg, %xmm8; \
- pcmpgtb LCASE_MAX_reg, %xmm9; \
- pandn CASE_ADD_reg, %xmm8; \
- pandn CASE_ADD_reg, %xmm9; \
- paddb %xmm8, reg1; \
- paddb %xmm9, reg2
- TOLOWER (%xmm1, %xmm2)
-#else
-# define TOLOWER(reg1, reg2)
-#endif
- pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
- pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
- jnz LABEL(less16bytes) /* If not, find different value or null char */
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz) /* finish comparision */
-#endif
- add $16, %rsi /* prepare to search next 16 bytes */
- add $16, %rdi /* prepare to search next 16 bytes */
-
- /*
- * Determine source and destination string offsets from 16-byte alignment.
- * Use relative offset difference between the two to determine which case
- * below to use.
- */
- .p2align 4
-LABEL(crosscache):
- and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
- and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
- mov $0xffff, %edx /* for equivalent offset */
- xor %r8d, %r8d
- and $0xf, %ecx /* offset of rsi */
- and $0xf, %eax /* offset of rdi */
- cmp %eax, %ecx
- je LABEL(ashr_0) /* rsi and rdi relative offset same */
- ja LABEL(bigger)
- mov %edx, %r8d /* r8d is offset flag for exit tail */
- xchg %ecx, %eax
- xchg %rsi, %rdi
-LABEL(bigger):
- lea 15(%rax), %r9
- sub %rcx, %r9
- lea LABEL(unaligned_table)(%rip), %r10
- movslq (%r10, %r9,4), %r9
- lea (%r10, %r9), %r10
- _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
-
-/*
- * The following cases will be handled by ashr_0
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(0~15) n(0~15) 15(15+ n-n) ashr_0
- */
- .p2align 4
-LABEL(ashr_0):
-
- movdqa (%rsi), %xmm1
- pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
-#else
- movdqa (%rdi), %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
-#endif
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- /*
- * edx must be the same with r9d if in left byte (16-rcx) is equal to
- * the start from (16-rax) and no null char was seen.
- */
- jne LABEL(less32bytes) /* mismatch or null char */
- UPDATE_STRNCMP_COUNTER
- mov $16, %rcx
- mov $16, %r9
- pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
-
- /*
- * Now both strings are aligned at 16-byte boundary. Loop over strings
- * checking 32-bytes per iteration.
- */
- .p2align 4
-LABEL(loop_ashr_0):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit) /* mismatch or null char seen */
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rcx
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rcx
- jmp LABEL(loop_ashr_0)
-
-/*
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n -15 0(15 +(n-15) - n) ashr_1
- */
- .p2align 4
-LABEL(ashr_1):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
- pslldq $15, %xmm2 /* shift first string to align with second */
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- jnz LABEL(less32bytes) /* mismatch or null char seen */
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads*/
- mov $1, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 1(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_1):
- add $16, %r10
- jg LABEL(nibble_ashr_1) /* cross page boundary */
-
-LABEL(gobble_ashr_1):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4 /* store for next cycle */
-
- psrldq $1, %xmm3
- pslldq $15, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_1) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4 /* store for next cycle */
-
- psrldq $1, %xmm3
- pslldq $15, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_1)
-
- /*
- * Nibble avoids loads across page boundary. This is to avoid a potential
- * access into unmapped memory.
- */
- .p2align 4
-LABEL(nibble_ashr_1):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
- pmovmskb %xmm0, %edx
- test $0xfffe, %edx
- jnz LABEL(ashr_1_exittail) /* find null char*/
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $15, %r11
- jbe LABEL(ashr_1_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10 /* substract 4K from %r10 */
- jmp LABEL(gobble_ashr_1)
-
- /*
- * Once find null char, determine if there is a string mismatch
- * before the null char.
- */
- .p2align 4
-LABEL(ashr_1_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $1, %xmm0
- psrldq $1, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
- */
- .p2align 4
-LABEL(ashr_2):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $14, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $2, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 2(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_2):
- add $16, %r10
- jg LABEL(nibble_ashr_2)
-
-LABEL(gobble_ashr_2):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $2, %xmm3
- pslldq $14, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_2) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $2, %xmm3
- pslldq $14, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_2)
-
- .p2align 4
-LABEL(nibble_ashr_2):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xfffc, %edx
- jnz LABEL(ashr_2_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $14, %r11
- jbe LABEL(ashr_2_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_2)
-
- .p2align 4
-LABEL(ashr_2_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $2, %xmm0
- psrldq $2, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
- */
- .p2align 4
-LABEL(ashr_3):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $13, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $3, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 3(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_3):
- add $16, %r10
- jg LABEL(nibble_ashr_3)
-
-LABEL(gobble_ashr_3):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $3, %xmm3
- pslldq $13, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_3) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $3, %xmm3
- pslldq $13, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_3)
-
- .p2align 4
-LABEL(nibble_ashr_3):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xfff8, %edx
- jnz LABEL(ashr_3_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $13, %r11
- jbe LABEL(ashr_3_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_3)
-
- .p2align 4
-LABEL(ashr_3_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $3, %xmm0
- psrldq $3, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
- */
- .p2align 4
-LABEL(ashr_4):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $12, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $4, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 4(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_4):
- add $16, %r10
- jg LABEL(nibble_ashr_4)
-
-LABEL(gobble_ashr_4):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $4, %xmm3
- pslldq $12, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_4) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $4, %xmm3
- pslldq $12, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_4)
-
- .p2align 4
-LABEL(nibble_ashr_4):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xfff0, %edx
- jnz LABEL(ashr_4_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $12, %r11
- jbe LABEL(ashr_4_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_4)
-
- .p2align 4
-LABEL(ashr_4_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $4, %xmm0
- psrldq $4, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
- */
- .p2align 4
-LABEL(ashr_5):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $11, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $5, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 5(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_5):
- add $16, %r10
- jg LABEL(nibble_ashr_5)
-
-LABEL(gobble_ashr_5):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $5, %xmm3
- pslldq $11, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_5) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $5, %xmm3
- pslldq $11, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_5)
-
- .p2align 4
-LABEL(nibble_ashr_5):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xffe0, %edx
- jnz LABEL(ashr_5_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $11, %r11
- jbe LABEL(ashr_5_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_5)
-
- .p2align 4
-LABEL(ashr_5_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $5, %xmm0
- psrldq $5, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
- */
- .p2align 4
-LABEL(ashr_6):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $10, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $6, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 6(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_6):
- add $16, %r10
- jg LABEL(nibble_ashr_6)
-
-LABEL(gobble_ashr_6):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $6, %xmm3
- pslldq $10, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_6) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $6, %xmm3
- pslldq $10, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_6)
-
- .p2align 4
-LABEL(nibble_ashr_6):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xffc0, %edx
- jnz LABEL(ashr_6_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $10, %r11
- jbe LABEL(ashr_6_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_6)
-
- .p2align 4
-LABEL(ashr_6_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $6, %xmm0
- psrldq $6, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
- */
- .p2align 4
-LABEL(ashr_7):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $9, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $7, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 7(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_7):
- add $16, %r10
- jg LABEL(nibble_ashr_7)
-
-LABEL(gobble_ashr_7):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $7, %xmm3
- pslldq $9, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_7) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $7, %xmm3
- pslldq $9, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_7)
-
- .p2align 4
-LABEL(nibble_ashr_7):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xff80, %edx
- jnz LABEL(ashr_7_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $9, %r11
- jbe LABEL(ashr_7_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_7)
-
- .p2align 4
-LABEL(ashr_7_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $7, %xmm0
- psrldq $7, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
- */
- .p2align 4
-LABEL(ashr_8):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $8, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $8, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 8(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_8):
- add $16, %r10
- jg LABEL(nibble_ashr_8)
-
-LABEL(gobble_ashr_8):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $8, %xmm3
- pslldq $8, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_8) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $8, %xmm3
- pslldq $8, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_8)
-
- .p2align 4
-LABEL(nibble_ashr_8):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xff00, %edx
- jnz LABEL(ashr_8_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $8, %r11
- jbe LABEL(ashr_8_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_8)
-
- .p2align 4
-LABEL(ashr_8_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $8, %xmm0
- psrldq $8, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
- */
- .p2align 4
-LABEL(ashr_9):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $7, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $9, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 9(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_9):
- add $16, %r10
- jg LABEL(nibble_ashr_9)
-
-LABEL(gobble_ashr_9):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $9, %xmm3
- pslldq $7, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_9) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $9, %xmm3
- pslldq $7, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3 /* store for next cycle */
- jmp LABEL(loop_ashr_9)
-
- .p2align 4
-LABEL(nibble_ashr_9):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xfe00, %edx
- jnz LABEL(ashr_9_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $7, %r11
- jbe LABEL(ashr_9_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_9)
-
- .p2align 4
-LABEL(ashr_9_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $9, %xmm0
- psrldq $9, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
- */
- .p2align 4
-LABEL(ashr_10):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $6, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $10, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 10(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_10):
- add $16, %r10
- jg LABEL(nibble_ashr_10)
-
-LABEL(gobble_ashr_10):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $10, %xmm3
- pslldq $6, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_10) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $10, %xmm3
- pslldq $6, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_10)
-
- .p2align 4
-LABEL(nibble_ashr_10):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xfc00, %edx
- jnz LABEL(ashr_10_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $6, %r11
- jbe LABEL(ashr_10_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_10)
-
- .p2align 4
-LABEL(ashr_10_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $10, %xmm0
- psrldq $10, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
- */
- .p2align 4
-LABEL(ashr_11):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $5, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $11, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 11(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_11):
- add $16, %r10
- jg LABEL(nibble_ashr_11)
-
-LABEL(gobble_ashr_11):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $11, %xmm3
- pslldq $5, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_11) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $11, %xmm3
- pslldq $5, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_11)
-
- .p2align 4
-LABEL(nibble_ashr_11):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xf800, %edx
- jnz LABEL(ashr_11_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $5, %r11
- jbe LABEL(ashr_11_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_11)
-
- .p2align 4
-LABEL(ashr_11_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $11, %xmm0
- psrldq $11, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
- */
- .p2align 4
-LABEL(ashr_12):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $4, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $12, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 12(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_12):
- add $16, %r10
- jg LABEL(nibble_ashr_12)
-
-LABEL(gobble_ashr_12):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $12, %xmm3
- pslldq $4, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_12) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $12, %xmm3
- pslldq $4, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_12)
-
- .p2align 4
-LABEL(nibble_ashr_12):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xf000, %edx
- jnz LABEL(ashr_12_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $4, %r11
- jbe LABEL(ashr_12_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_12)
-
- .p2align 4
-LABEL(ashr_12_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $12, %xmm0
- psrldq $12, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
- */
- .p2align 4
-LABEL(ashr_13):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $3, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $13, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 13(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_13):
- add $16, %r10
- jg LABEL(nibble_ashr_13)
-
-LABEL(gobble_ashr_13):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $13, %xmm3
- pslldq $3, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_13) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $13, %xmm3
- pslldq $3, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_13)
-
- .p2align 4
-LABEL(nibble_ashr_13):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xe000, %edx
- jnz LABEL(ashr_13_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $3, %r11
- jbe LABEL(ashr_13_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_13)
-
- .p2align 4
-LABEL(ashr_13_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $13, %xmm0
- psrldq $13, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
- */
- .p2align 4
-LABEL(ashr_14):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $2, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $14, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 14(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_14):
- add $16, %r10
- jg LABEL(nibble_ashr_14)
-
-LABEL(gobble_ashr_14):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $14, %xmm3
- pslldq $2, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_14) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $14, %xmm3
- pslldq $2, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_14)
-
- .p2align 4
-LABEL(nibble_ashr_14):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0xc000, %edx
- jnz LABEL(ashr_14_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp $2, %r11
- jbe LABEL(ashr_14_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_14)
-
- .p2align 4
-LABEL(ashr_14_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $14, %xmm0
- psrldq $14, %xmm3
- jmp LABEL(aftertail)
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
- */
- .p2align 4
-LABEL(ashr_15):
- pxor %xmm0, %xmm0
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0
- pslldq $1, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
-
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- pxor %xmm0, %xmm0
- mov $16, %rcx /* index for loads */
- mov $15, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 15(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
-
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- .p2align 4
-LABEL(loop_ashr_15):
- add $16, %r10
- jg LABEL(nibble_ashr_15)
-
-LABEL(gobble_ashr_15):
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $15, %xmm3
- pslldq $1, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
-
- add $16, %r10
- jg LABEL(nibble_ashr_15) /* cross page boundary */
-
- movdqa (%rsi, %rcx), %xmm1
- movdqa (%rdi, %rcx), %xmm2
- movdqa %xmm2, %xmm4
-
- psrldq $15, %xmm3
- pslldq $1, %xmm2
- por %xmm3, %xmm2 /* merge into one 16byte value */
-
- TOLOWER (%xmm1, %xmm2)
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx
- jnz LABEL(exit)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rcx
- movdqa %xmm4, %xmm3
- jmp LABEL(loop_ashr_15)
-
- .p2align 4
-LABEL(nibble_ashr_15):
- pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
- pmovmskb %xmm0, %edx
- test $0x8000, %edx
- jnz LABEL(ashr_15_exittail)
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmpq $1, %r11
- jbe LABEL(ashr_15_exittail)
-#endif
-
- pxor %xmm0, %xmm0
- sub $0x1000, %r10
- jmp LABEL(gobble_ashr_15)
-
- .p2align 4
-LABEL(ashr_15_exittail):
- movdqa (%rsi, %rcx), %xmm1
- psrldq $15, %xmm3
- psrldq $15, %xmm0
-
- .p2align 4
-LABEL(aftertail):
- TOLOWER (%xmm1, %xmm3)
- pcmpeqb %xmm3, %xmm1
- psubb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- not %edx
-
- .p2align 4
-LABEL(exit):
- lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
-LABEL(less32bytes):
- lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
- lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
- test %r8d, %r8d
- jz LABEL(ret)
- xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
-
- .p2align 4
-LABEL(ret):
-LABEL(less16bytes):
- bsf %rdx, %rdx /* find and store bit index in %rdx */
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rdx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzbl (%rsi, %rdx), %ecx
- movzbl (%rdi, %rdx), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
-
-LABEL(strcmp_exitz):
- xor %eax, %eax
- ret
-
- .p2align 4
-LABEL(Byte0):
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
-END (STRCMP)
-
- .section .rodata,"a",@progbits
- .p2align 3
-LABEL(unaligned_table):
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
- .int LABEL(ashr_0) - LABEL(unaligned_table)
-libc_hidden_builtin_def (STRCMP)
+#include "multiarch/strcmp-sse2.S"
+libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/x86_64/strncase_l.S b/sysdeps/x86_64/strncase_l.S
index c725cd85b3..3780fc50b1 100644
--- a/sysdeps/x86_64/strncase_l.S
+++ b/sysdeps/x86_64/strncase_l.S
@@ -1,6 +1,11 @@
-#define STRCMP __strncasecmp_l
-#define USE_AS_STRNCASECMP_L
-#include "strcmp.S"
+/* Symbols = __strncasecmp_l and __strncasecmp. */
+
+#include "multiarch/strncase_l-sse2.S"
+
+libc_hidden_builtin_def (__strncasecmp_l)
weak_alias (__strncasecmp_l, strncasecmp_l)
libc_hidden_def (strncasecmp_l)
+
+weak_alias (__strncasecmp, strncasecmp)
+libc_hidden_def (__strncasecmp)
diff --git a/sysdeps/x86_64/strncmp.S b/sysdeps/x86_64/strncmp.S
index 0af34e7f15..13d9e82ee2 100644
--- a/sysdeps/x86_64/strncmp.S
+++ b/sysdeps/x86_64/strncmp.S
@@ -1,3 +1,4 @@
-#define STRCMP strncmp
-#define USE_AS_STRNCMP
-#include "strcmp.S"
+/* Symbol = strncmp. */
+
+#include "multiarch/strncmp-sse2.S"
+libc_hidden_builtin_def (strncmp)
--
2.34.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 2/4] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S
2022-07-12 19:28 ` [PATCH v1 2/4] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S Noah Goldstein
@ 2022-07-12 23:57 ` H.J. Lu
0 siblings, 0 replies; 8+ messages in thread
From: H.J. Lu @ 2022-07-12 23:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Because strcmp-sse2.S implements so many functions (more from
> avx2/evex/sse42) add a new file 'strcmp-naming.h' to assist in
> getting the correct symbol name for all the functions across
> multiarch/non-multiarch builds.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/rtld-strcmp.S | 18 +
> sysdeps/x86_64/multiarch/rtld-strncmp.S | 18 +
> sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S | 5 +-
> sysdeps/x86_64/multiarch/strcmp-naming.h | 68 +
> sysdeps/x86_64/multiarch/strcmp-sse2.S | 2140 ++++++++++++++++-
> sysdeps/x86_64/multiarch/strncase_l-sse2.S | 5 +-
> sysdeps/x86_64/multiarch/strncmp-sse2.S | 12 +-
> sysdeps/x86_64/strcasecmp_l.S | 11 +-
> sysdeps/x86_64/strcmp.S | 2147 +-----------------
> sysdeps/x86_64/strncase_l.S | 11 +-
> sysdeps/x86_64/strncmp.S | 7 +-
> 11 files changed, 2264 insertions(+), 2178 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strcmp.S
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strncmp.S
> create mode 100644 sysdeps/x86_64/multiarch/strcmp-naming.h
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strcmp.S b/sysdeps/x86_64/multiarch/rtld-strcmp.S
> new file mode 100644
> index 0000000000..207078bdcc
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strcmp.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strncmp.S b/sysdeps/x86_64/multiarch/rtld-strncmp.S
> new file mode 100644
> index 0000000000..ac32150406
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strncmp.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strncmp.S"
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
> index 2360d104dd..a2b5741399 100644
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse2.S
> @@ -16,8 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#define STRCMP __strcasecmp_l_sse2
> #define USE_AS_STRCASECMP_L
> -#define NO_NOLOCALE_ALIAS
> -#define __strcasecmp __strcasecmp_sse2
> -#include <sysdeps/x86_64/strcmp.S>
> +#include "strcmp-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-naming.h b/sysdeps/x86_64/multiarch/strcmp-naming.h
> new file mode 100644
> index 0000000000..6a7529b6a4
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strcmp-naming.h
> @@ -0,0 +1,68 @@
> +#ifndef _STRCMP_NAMING_H_
> +#define _STRCMP_NAMING_H_
> +
> +/* Utility macros. */
> +#define STRCMP_SUFFIX(x, y) x##y
> +#define STRCMP_NAME(x, y) STRCMP_SUFFIX (x, y)
> +
> +/* Setup base of all definitions. */
> +#define STRNCASECMP_BASE __strncasecmp
> +#define STRCASECMP_BASE __strcasecmp
> +#define WCSCMP_BASE __wcscmp
> +
> +#if defined USE_MULTIARCH && IS_IN (libc)
> +# define WCSNCMP_BASE __wcsncmp
> +# define STRNCMP_BASE __strncmp
> +# define STRCMP_BASE __strcmp
> +
> +#else
> +/* Covers IS_IN (rtld) or non-multiarch build. */
> +# define WCSNCMP_BASE wcsncmp
> +# define STRNCMP_BASE strncmp
> +# define STRCMP_BASE strcmp
> +
> +# undef STRCMP_ISA
> +# define STRCMP_ISA
> +#endif
> +
> +#if IS_IN (rtld) || defined USE_MULTIARCH
> +# define ISA_HIDDEN_JUMPTARGET(...) __VA_ARGS__
> +#else
> +# define ISA_HIDDEN_JUMPTARGET(...) HIDDEN_JUMPTARGET (__VA_ARGS__)
> +#endif
> +
> +/* Get correct symbol for OVERFLOW_STRCMP, STRCMP, and
> + STRCASECMP. */
> +#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> +
> +# if defined USE_AS_WCSCMP || defined USE_AS_WCSNCMP
> +# define OVERFLOW_STRCMP_SYM WCSCMP_BASE
> +# define STRCMP_SYM WCSNCMP_BASE
> +# elif defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> +# define OVERFLOW_STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
> +# define STRCMP_SYM STRCMP_NAME (STRNCASECMP_BASE, _l)
> +# else
> +# define OVERFLOW_STRCMP_SYM STRCMP_BASE
> +# define STRCMP_SYM STRNCMP_BASE
> +# endif
> +
> +# define STRCASECMP_SYM STRNCASECMP_BASE
> +# define OVERFLOW_STRCMP \
> + ISA_HIDDEN_JUMPTARGET (STRCMP_NAME (OVERFLOW_STRCMP_SYM, STRCMP_ISA))
> +#else
> +# ifdef USE_AS_WCSCMP
> +# define STRCMP_SYM WCSCMP_BASE
> +# elif defined USE_AS_STRCASECMP_L
> +# define STRCMP_SYM STRCMP_NAME (STRCASECMP_BASE, _l)
> +# else
> +# define STRCMP_SYM STRCMP_BASE
> +# endif
> +
> +# define STRCASECMP_SYM STRCASECMP_BASE
> +#endif
> +
> +#define STRCASECMP_L_NONASCII STRCMP_NAME (STRCASECMP_SYM, _l_nonascii)
> +#define STRCASECMP STRCMP_NAME (STRCASECMP_SYM, STRCMP_ISA)
> +#define STRCMP STRCMP_NAME (STRCMP_SYM, STRCMP_ISA)
> +
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2.S b/sysdeps/x86_64/multiarch/strcmp-sse2.S
> index b8f95e59cf..b1220231ab 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse2.S
> @@ -16,13 +16,2141 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#if IS_IN (libc)
> +#if IS_IN (libc) || IS_IN (rtld)
> +
> +# define STRCMP_ISA _sse2
> +# include "strcmp-naming.h"
> +
> # include <sysdep.h>
>
> -# define STRCMP __strcmp_sse2
> +# undef UPDATE_STRNCMP_COUNTER
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcmp)
> -#endif
> +# ifndef LABEL
> +# define LABEL(l) L(l)
> +# endif
> +
> +# ifdef USE_AS_STRNCMP
> +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
> + if the new counter > the old one or is 0. */
> +# define UPDATE_STRNCMP_COUNTER \
> + /* calculate left number to compare */ \
> + lea -16(%rcx, %r11), %r9; \
> + cmp %r9, %r11; \
> + jb LABEL(strcmp_exitz); \
> + test %r9, %r9; \
> + je LABEL(strcmp_exitz); \
> + mov %r9, %r11
> +
> +# elif defined USE_AS_STRCASECMP_L
> +# include "locale-defines.h"
> +
> +# define UPDATE_STRNCMP_COUNTER
> +# elif defined USE_AS_STRNCASECMP_L
> +# include "locale-defines.h"
> +
> +# define UPDATE_STRNCMP_COUNTER \
> + /* calculate left number to compare */ \
> + lea -16(%rcx, %r11), %r9; \
> + cmp %r9, %r11; \
> + jb LABEL(strcmp_exitz); \
> + test %r9, %r9; \
> + je LABEL(strcmp_exitz); \
> + mov %r9, %r11
> +# else
> +# define UPDATE_STRNCMP_COUNTER
> +# endif
> +
> + .text
> +# ifdef USE_AS_STRCASECMP_L
> +# ifndef ENTRY2
> +# define ENTRY2(name) ENTRY (name)
> +# define END2(name) END (name)
> +# endif
> +
> +ENTRY2 (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> + mov %fs:(%rax),%RDX_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END2 (STRCASECMP)
> + /* FALLTHROUGH to strcasecmp_l. */
> +# elif defined USE_AS_STRNCASECMP_L
> +# ifndef ENTRY2
> +# define ENTRY2(name) ENTRY (name)
> +# define END2(name) END (name)
> +# endif
> +
> +ENTRY2 (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> + mov %fs:(%rax),%RCX_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END2 (STRCASECMP)
> + /* FALLTHROUGH to strncasecmp_l. */
> +# endif
> +
> +ENTRY (STRCMP)
> +# ifdef USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales
> + with encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
> +# else
> + mov (%rdx), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> + jne __strcasecmp_l_nonascii
> +# elif defined USE_AS_STRNCASECMP_L
> + /* We have to fall back on the C implementation for locales
> + with encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
> +# else
> + mov (%rcx), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> + jne __strncasecmp_l_nonascii
> +# endif
> +
> +/*
> + * This implementation uses SSE to compare up to 16 bytes at a time.
> + */
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + test %RDX_LP, %RDX_LP
> + je LABEL(strcmp_exitz)
> + cmp $1, %RDX_LP
> + je LABEL(Byte0)
> + mov %RDX_LP, %R11_LP
> +# endif
> + mov %esi, %ecx
> + mov %edi, %eax
> +/* Use 64bit AND here to avoid long NOP padding. */
> + and $0x3f, %rcx /* rsi alignment in cache line */
> + and $0x3f, %rax /* rdi alignment in cache line */
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + .section .rodata.cst16,"aM",@progbits,16
> + .align 16
> +.Llcase_min:
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +.Llcase_max:
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +.Lcase_add:
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> + movdqa .Llcase_min(%rip), %xmm5
> +# define LCASE_MIN_reg %xmm5
> + movdqa .Llcase_max(%rip), %xmm6
> +# define LCASE_MAX_reg %xmm6
> + movdqa .Lcase_add(%rip), %xmm7
> +# define CASE_ADD_reg %xmm7
> +# endif
> + cmp $0x30, %ecx
> + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> + cmp $0x30, %eax
> + ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
> + movlpd (%rdi), %xmm1
> + movlpd (%rsi), %xmm2
> + movhpd 8(%rdi), %xmm1
> + movhpd 8(%rsi), %xmm2
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> +# define TOLOWER(reg1, reg2) \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + movdqa LCASE_MIN_reg, %xmm9; \
> + paddb reg1, %xmm8; \
> + paddb reg2, %xmm9; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm9; \
> + pandn CASE_ADD_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm9; \
> + paddb %xmm8, reg1; \
> + paddb %xmm9, reg2
> + TOLOWER (%xmm1, %xmm2)
> +# else
> +# define TOLOWER(reg1, reg2)
> +# endif
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> + jnz LABEL(less16bytes) /* If not, find different value or null char */
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz) /* finish comparison */
> +# endif
> + add $16, %rsi /* prepare to search next 16 bytes */
> + add $16, %rdi /* prepare to search next 16 bytes */
> +
> + /*
> + * Determine source and destination string offsets from 16-byte alignment.
> + * Use relative offset difference between the two to determine which case
> + * below to use.
> + */
> + .p2align 4
> +LABEL(crosscache):
> + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
> + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
> + mov $0xffff, %edx /* for equivalent offset */
> + xor %r8d, %r8d
> + and $0xf, %ecx /* offset of rsi */
> + and $0xf, %eax /* offset of rdi */
> + cmp %eax, %ecx
> + je LABEL(ashr_0) /* rsi and rdi relative offset same */
> + ja LABEL(bigger)
> + mov %edx, %r8d /* r8d is offset flag for exit tail */
> + xchg %ecx, %eax
> + xchg %rsi, %rdi
> +LABEL(bigger):
> + lea 15(%rax), %r9
> + sub %rcx, %r9
> + lea LABEL(unaligned_table)(%rip), %r10
> + movslq (%r10, %r9,4), %r9
> + lea (%r10, %r9), %r10
> + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
> +
> +/*
> + * The following cases will be handled by ashr_0
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(0~15) n(0~15) 15(15+ n-n) ashr_0
> + */
> + .p2align 4
> +LABEL(ashr_0):
> +
> + movdqa (%rsi), %xmm1
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> +# else
> + movdqa (%rdi), %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> +# endif
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %r9d
> + shr %cl, %edx /* adjust 0xffff for offset */
> + shr %cl, %r9d /* adjust for 16-byte offset */
> + sub %r9d, %edx
> + /*
> + * edx is zero here only if the (16 - rcx) bytes left in this 16-byte
> + * block are equal in both strings and no null char was seen.
> + */
> + jne LABEL(less32bytes) /* mismatch or null char */
> + UPDATE_STRNCMP_COUNTER
> + mov $16, %rcx
> + mov $16, %r9
> + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
> +
> + /*
> + * Now both strings are aligned at 16-byte boundary. Loop over strings
> + * checking 32-bytes per iteration.
> + */
> + .p2align 4
> +LABEL(loop_ashr_0):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit) /* mismatch or null char seen */
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rcx
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rcx
> + jmp LABEL(loop_ashr_0)
> +
> +/*
> + * The following cases will be handled by ashr_1
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(15) n -15 0(15 +(n-15) - n) ashr_1
> + */
> + .p2align 4
> +LABEL(ashr_1):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> + pslldq $15, %xmm2 /* shift first string to align with second */
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx /* adjust 0xffff for offset */
> + shr %cl, %r9d /* adjust for 16-byte offset */
> + sub %r9d, %edx
> + jnz LABEL(less32bytes) /* mismatch or null char seen */
> + movdqa (%rdi), %xmm3
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads*/
> + mov $1, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 1(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_1):
> + add $16, %r10
> + jg LABEL(nibble_ashr_1) /* cross page boundary */
> +
> +LABEL(gobble_ashr_1):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4 /* store for next cycle */
> +
> + psrldq $1, %xmm3
> + pslldq $15, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_1) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4 /* store for next cycle */
> +
> + psrldq $1, %xmm3
> + pslldq $15, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_1)
> +
> + /*
> + * Nibble avoids loads across page boundary. This is to avoid a potential
> + * access into unmapped memory.
> + */
> + .p2align 4
> +LABEL(nibble_ashr_1):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
> + pmovmskb %xmm0, %edx
> + test $0xfffe, %edx
> + jnz LABEL(ashr_1_exittail) /* find null char*/
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $15, %r11
> + jbe LABEL(ashr_1_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10 /* subtract 4K from %r10 */
> + jmp LABEL(gobble_ashr_1)
> +
> + /*
> + * Once a null char is found, determine if there is a string mismatch
> + * before the null char.
> + */
> + .p2align 4
> +LABEL(ashr_1_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $1, %xmm0
> + psrldq $1, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_2
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
> + */
> + .p2align 4
> +LABEL(ashr_2):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $14, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $2, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 2(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_2):
> + add $16, %r10
> + jg LABEL(nibble_ashr_2)
> +
> +LABEL(gobble_ashr_2):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $2, %xmm3
> + pslldq $14, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_2) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $2, %xmm3
> + pslldq $14, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_2)
> +
> + .p2align 4
> +LABEL(nibble_ashr_2):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xfffc, %edx
> + jnz LABEL(ashr_2_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $14, %r11
> + jbe LABEL(ashr_2_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_2)
> +
> + .p2align 4
> +LABEL(ashr_2_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $2, %xmm0
> + psrldq $2, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_3
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
> + */
> + .p2align 4
> +LABEL(ashr_3):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $13, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $3, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 3(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_3):
> + add $16, %r10
> + jg LABEL(nibble_ashr_3)
> +
> +LABEL(gobble_ashr_3):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $3, %xmm3
> + pslldq $13, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_3) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $3, %xmm3
> + pslldq $13, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_3)
> +
> + .p2align 4
> +LABEL(nibble_ashr_3):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xfff8, %edx
> + jnz LABEL(ashr_3_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $13, %r11
> + jbe LABEL(ashr_3_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_3)
> +
> + .p2align 4
> +LABEL(ashr_3_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $3, %xmm0
> + psrldq $3, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_4
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
> + */
> + .p2align 4
> +LABEL(ashr_4):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $12, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $4, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 4(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_4):
> + add $16, %r10
> + jg LABEL(nibble_ashr_4)
> +
> +LABEL(gobble_ashr_4):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $4, %xmm3
> + pslldq $12, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_4) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $4, %xmm3
> + pslldq $12, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_4)
> +
> + .p2align 4
> +LABEL(nibble_ashr_4):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xfff0, %edx
> + jnz LABEL(ashr_4_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $12, %r11
> + jbe LABEL(ashr_4_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_4)
> +
> + .p2align 4
> +LABEL(ashr_4_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $4, %xmm0
> + psrldq $4, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_5
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
> + */
> + .p2align 4
> +LABEL(ashr_5):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $11, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $5, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 5(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_5):
> + add $16, %r10
> + jg LABEL(nibble_ashr_5)
> +
> +LABEL(gobble_ashr_5):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $5, %xmm3
> + pslldq $11, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_5) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $5, %xmm3
> + pslldq $11, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_5)
> +
> + .p2align 4
> +LABEL(nibble_ashr_5):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xffe0, %edx
> + jnz LABEL(ashr_5_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $11, %r11
> + jbe LABEL(ashr_5_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_5)
> +
> + .p2align 4
> +LABEL(ashr_5_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $5, %xmm0
> + psrldq $5, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_6
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
> + */
> + .p2align 4
> +LABEL(ashr_6):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $10, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $6, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 6(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_6):
> + add $16, %r10
> + jg LABEL(nibble_ashr_6)
> +
> +LABEL(gobble_ashr_6):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $6, %xmm3
> + pslldq $10, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_6) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $6, %xmm3
> + pslldq $10, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_6)
> +
> + .p2align 4
> +LABEL(nibble_ashr_6):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xffc0, %edx
> + jnz LABEL(ashr_6_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $10, %r11
> + jbe LABEL(ashr_6_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_6)
> +
> + .p2align 4
> +LABEL(ashr_6_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $6, %xmm0
> + psrldq $6, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_7
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
> + */
> + .p2align 4
> +LABEL(ashr_7):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $9, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $7, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 7(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_7):
> + add $16, %r10
> + jg LABEL(nibble_ashr_7)
> +
> +LABEL(gobble_ashr_7):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $7, %xmm3
> + pslldq $9, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_7) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $7, %xmm3
> + pslldq $9, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_7)
> +
> + .p2align 4
> +LABEL(nibble_ashr_7):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xff80, %edx
> + jnz LABEL(ashr_7_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $9, %r11
> + jbe LABEL(ashr_7_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_7)
> +
> + .p2align 4
> +LABEL(ashr_7_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $7, %xmm0
> + psrldq $7, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_8
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
> + */
> + .p2align 4
> +LABEL(ashr_8):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $8, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $8, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 8(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_8):
> + add $16, %r10
> + jg LABEL(nibble_ashr_8)
> +
> +LABEL(gobble_ashr_8):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $8, %xmm3
> + pslldq $8, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
>
> -#include <sysdeps/x86_64/strcmp.S>
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_8) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $8, %xmm3
> + pslldq $8, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_8)
> +
> + .p2align 4
> +LABEL(nibble_ashr_8):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xff00, %edx
> + jnz LABEL(ashr_8_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $8, %r11
> + jbe LABEL(ashr_8_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_8)
> +
> + .p2align 4
> +LABEL(ashr_8_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $8, %xmm0
> + psrldq $8, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_9
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
> + */
> + .p2align 4
> +LABEL(ashr_9):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $7, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $9, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 9(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_9):
> + add $16, %r10
> + jg LABEL(nibble_ashr_9)
> +
> +LABEL(gobble_ashr_9):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $9, %xmm3
> + pslldq $7, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_9) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $9, %xmm3
> + pslldq $7, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3 /* store for next cycle */
> + jmp LABEL(loop_ashr_9)
> +
> + .p2align 4
> +LABEL(nibble_ashr_9):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xfe00, %edx
> + jnz LABEL(ashr_9_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $7, %r11
> + jbe LABEL(ashr_9_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_9)
> +
> + .p2align 4
> +LABEL(ashr_9_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $9, %xmm0
> + psrldq $9, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_10
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
> + */
> + .p2align 4
> +LABEL(ashr_10):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $6, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $10, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 10(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_10):
> + add $16, %r10
> + jg LABEL(nibble_ashr_10)
> +
> +LABEL(gobble_ashr_10):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $10, %xmm3
> + pslldq $6, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_10) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $10, %xmm3
> + pslldq $6, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_10)
> +
> + .p2align 4
> +LABEL(nibble_ashr_10):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xfc00, %edx
> + jnz LABEL(ashr_10_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $6, %r11
> + jbe LABEL(ashr_10_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_10)
> +
> + .p2align 4
> +LABEL(ashr_10_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $10, %xmm0
> + psrldq $10, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_11
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
> + */
> + .p2align 4
> +LABEL(ashr_11):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $5, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $11, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 11(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_11):
> + add $16, %r10
> + jg LABEL(nibble_ashr_11)
> +
> +LABEL(gobble_ashr_11):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $11, %xmm3
> + pslldq $5, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_11) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $11, %xmm3
> + pslldq $5, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_11)
> +
> + .p2align 4
> +LABEL(nibble_ashr_11):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xf800, %edx
> + jnz LABEL(ashr_11_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $5, %r11
> + jbe LABEL(ashr_11_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_11)
> +
> + .p2align 4
> +LABEL(ashr_11_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $11, %xmm0
> + psrldq $11, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_12
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
> + */
> + .p2align 4
> +LABEL(ashr_12):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $4, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $12, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 12(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_12):
> + add $16, %r10
> + jg LABEL(nibble_ashr_12)
> +
> +LABEL(gobble_ashr_12):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $12, %xmm3
> + pslldq $4, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_12) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $12, %xmm3
> + pslldq $4, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_12)
> +
> + .p2align 4
> +LABEL(nibble_ashr_12):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xf000, %edx
> + jnz LABEL(ashr_12_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $4, %r11
> + jbe LABEL(ashr_12_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_12)
> +
> + .p2align 4
> +LABEL(ashr_12_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $12, %xmm0
> + psrldq $12, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_13
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
> + */
> + .p2align 4
> +LABEL(ashr_13):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $3, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $13, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 13(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_13):
> + add $16, %r10
> + jg LABEL(nibble_ashr_13)
> +
> +LABEL(gobble_ashr_13):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $13, %xmm3
> + pslldq $3, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_13) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $13, %xmm3
> + pslldq $3, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_13)
> +
> + .p2align 4
> +LABEL(nibble_ashr_13):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xe000, %edx
> + jnz LABEL(ashr_13_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $3, %r11
> + jbe LABEL(ashr_13_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_13)
> +
> + .p2align 4
> +LABEL(ashr_13_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $13, %xmm0
> + psrldq $13, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_14
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
> + */
> + .p2align 4
> +LABEL(ashr_14):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $2, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $14, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 14(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_14):
> + add $16, %r10 /* advance page-offset tracker by one chunk */
> + jg LABEL(nibble_ashr_14) /* positive: handle page crossing first */
> +
> +LABEL(gobble_ashr_14):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4 /* keep unshifted chunk for next pass */
> +
> + psrldq $14, %xmm3 /* previous chunk: keep its top 2 bytes */
> + pslldq $2, %xmm2 /* new chunk: make room for those 2 bytes */
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0 /* %xmm0 = null-char mask of %xmm1 */
> + pcmpeqb %xmm2, %xmm1 /* %xmm1 = per-byte equality mask */
> + psubb %xmm0, %xmm1 /* drop equal positions that hold a null */
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* zero iff all 16 equal and no null */
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11 /* 16 more bytes of the count consumed */
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_14) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4 /* keep unshifted chunk for next pass */
> +
> + psrldq $14, %xmm3
> + pslldq $2, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_14)
> +
> + .p2align 4
> +LABEL(nibble_ashr_14):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0xc000, %edx
> + jnz LABEL(ashr_14_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $2, %r11
> + jbe LABEL(ashr_14_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0
> + sub $0x1000, %r10
> + jmp LABEL(gobble_ashr_14)
> +
> + .p2align 4
> +LABEL(ashr_14_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $14, %xmm0
> + psrldq $14, %xmm3
> + jmp LABEL(aftertail)
> +
> +/*
> + * The following cases will be handled by ashr_15
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
> + */
> + .p2align 4
> +LABEL(ashr_15):
> + pxor %xmm0, %xmm0
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0
> + pslldq $1, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> +
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + pxor %xmm0, %xmm0
> + mov $16, %rcx /* index for loads */
> + mov $15, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 15(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> +
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + .p2align 4
> +LABEL(loop_ashr_15):
> + add $16, %r10
> + jg LABEL(nibble_ashr_15)
> +
> +LABEL(gobble_ashr_15):
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $15, %xmm3
> + pslldq $1, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> +
> + add $16, %r10
> + jg LABEL(nibble_ashr_15) /* cross page boundary */
> +
> + movdqa (%rsi, %rcx), %xmm1
> + movdqa (%rdi, %rcx), %xmm2
> + movdqa %xmm2, %xmm4
> +
> + psrldq $15, %xmm3
> + pslldq $1, %xmm2
> + por %xmm3, %xmm2 /* merge into one 16byte value */
> +
> + TOLOWER (%xmm1, %xmm2)
> +
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm2, %xmm1
> + psubb %xmm0, %xmm1
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx
> + jnz LABEL(exit)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rcx
> + movdqa %xmm4, %xmm3
> + jmp LABEL(loop_ashr_15)
> +
> + .p2align 4
> +LABEL(nibble_ashr_15):
> + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> + pmovmskb %xmm0, %edx
> + test $0x8000, %edx /* only byte 15 of %xmm3 is carried over */
> + jnz LABEL(ashr_15_exittail)
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp $1, %r11 /* count ends within the carried-over byte? */
> + jbe LABEL(ashr_15_exittail)
> +# endif
> +
> + pxor %xmm0, %xmm0 /* re-zero; clobbered by the null check above */
> + sub $0x1000, %r10 /* re-arm the page-crossing tracker */
> + jmp LABEL(gobble_ashr_15)
> +
> + .p2align 4
> +LABEL(ashr_15_exittail):
> + movdqa (%rsi, %rcx), %xmm1
> + psrldq $15, %xmm3
> + psrldq $15, %xmm0
> +
> + .p2align 4
> +LABEL(aftertail):
> + TOLOWER (%xmm1, %xmm3)
> + pcmpeqb %xmm3, %xmm1 /* per-byte equality mask */
> + psubb %xmm0, %xmm1 /* drop equal positions flagged in %xmm0 */
> + pmovmskb %xmm1, %edx
> + not %edx /* set bits now mark mismatch or null char */
> +
> + .p2align 4
> +LABEL(exit):
> + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
> +LABEL(less32bytes):
> + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
> + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
> + test %r8d, %r8d /* nonzero %r8d: operands were swapped earlier */
> + jz LABEL(ret)
> + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
> +
> + .p2align 4
> +LABEL(ret):
> +LABEL(less16bytes):
> + bsf %rdx, %rdx /* find and store bit index in %rdx */
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub %rdx, %r11 /* index at or past the remaining count? */
> + jbe LABEL(strcmp_exitz)
> +# endif
> + movzbl (%rsi, %rdx), %ecx /* byte from second operand */
> + movzbl (%rdi, %rdx), %eax /* byte from first operand */
> +
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> + movl (%rdx,%rcx,4), %ecx /* table lookup, 4-byte entries */
> + movl (%rdx,%rax,4), %eax
> +# endif
> +
> + sub %ecx, %eax /* result = first byte - second byte */
> + ret
> +
> +LABEL(strcmp_exitz):
> + xor %eax, %eax /* return 0 */
> + ret
> +
> + .p2align 4
> +LABEL(Byte0):
> + movzbl (%rsi), %ecx /* first byte of second operand */
> + movzbl (%rdi), %eax /* first byte of first operand */
> +
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> + movl (%rdx,%rcx,4), %ecx /* table lookup, 4-byte entries */
> + movl (%rdx,%rax,4), %eax
> +# endif
> +
> + sub %ecx, %eax /* result = first byte - second byte */
> + ret
> +END (STRCMP)
> +
> + .section .rodata,"a",@progbits
> + .p2align 3
> +LABEL(unaligned_table): /* 32-bit offsets relative to this label; NOTE(review): indexed by 15 + rax - rcx in the dispatch code - confirm */
> + .int LABEL(ashr_1) - LABEL(unaligned_table) /* index 0 */
> + .int LABEL(ashr_2) - LABEL(unaligned_table)
> + .int LABEL(ashr_3) - LABEL(unaligned_table)
> + .int LABEL(ashr_4) - LABEL(unaligned_table)
> + .int LABEL(ashr_5) - LABEL(unaligned_table)
> + .int LABEL(ashr_6) - LABEL(unaligned_table)
> + .int LABEL(ashr_7) - LABEL(unaligned_table)
> + .int LABEL(ashr_8) - LABEL(unaligned_table)
> + .int LABEL(ashr_9) - LABEL(unaligned_table)
> + .int LABEL(ashr_10) - LABEL(unaligned_table)
> + .int LABEL(ashr_11) - LABEL(unaligned_table)
> + .int LABEL(ashr_12) - LABEL(unaligned_table)
> + .int LABEL(ashr_13) - LABEL(unaligned_table)
> + .int LABEL(ashr_14) - LABEL(unaligned_table)
> + .int LABEL(ashr_15) - LABEL(unaligned_table)
> + .int LABEL(ashr_0) - LABEL(unaligned_table) /* index 15 */
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse2.S b/sysdeps/x86_64/multiarch/strncase_l-sse2.S
> index 0ca4c836b2..fd8ad07450 100644
> --- a/sysdeps/x86_64/multiarch/strncase_l-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strncase_l-sse2.S
> @@ -16,8 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#define STRCMP __strncasecmp_l_sse2
> -#define NO_NOLOCALE_ALIAS
> #define USE_AS_STRNCASECMP_L
> -#define __strncasecmp __strncasecmp_sse2
> -#include <sysdeps/x86_64/strcmp.S>
> +#include "strcmp-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-sse2.S b/sysdeps/x86_64/multiarch/strncmp-sse2.S
> index e3ba94f926..2152b8dc3d 100644
> --- a/sysdeps/x86_64/multiarch/strncmp-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strncmp-sse2.S
> @@ -16,15 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -# define STRCMP __strncmp_sse2
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcmp)
> -#else
> -# define STRCMP strncmp
> -#endif
> -
> #define USE_AS_STRNCMP
> -#include <sysdeps/x86_64/strcmp.S>
> +#include "strcmp-sse2.S"
> diff --git a/sysdeps/x86_64/strcasecmp_l.S b/sysdeps/x86_64/strcasecmp_l.S
> index 5456b3a49e..84fd7fdfd3 100644
> --- a/sysdeps/x86_64/strcasecmp_l.S
> +++ b/sysdeps/x86_64/strcasecmp_l.S
> @@ -1,6 +1,11 @@
> -#define STRCMP __strcasecmp_l
> -#define USE_AS_STRCASECMP_L
> -#include "strcmp.S"
> +/* Symbols = __strcasecmp_l and __strcasecmp. */
> +
> +#include "multiarch/strcasecmp_l-sse2.S"
> +
> +libc_hidden_builtin_def (__strcasecmp_l)
>
> weak_alias (__strcasecmp_l, strcasecmp_l)
> libc_hidden_def (strcasecmp_l)
> +
> +weak_alias (__strcasecmp, strcasecmp)
> +libc_hidden_def (__strcasecmp)
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index c38dc627f9..19e54bd3a7 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -16,2148 +16,7 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> +/* Symbol = strcmp. */
>
> -#undef UPDATE_STRNCMP_COUNTER
> -
> -#ifndef LABEL
> -#define LABEL(l) L(l)
> -#endif
> -
> -#ifdef USE_AS_STRNCMP
> -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
> - if the new counter > the old one or is 0. */
> -# define UPDATE_STRNCMP_COUNTER \
> - /* calculate left number to compare */ \
> - lea -16(%rcx, %r11), %r9; \
> - cmp %r9, %r11; \
> - jb LABEL(strcmp_exitz); \
> - test %r9, %r9; \
> - je LABEL(strcmp_exitz); \
> - mov %r9, %r11
> -
> -#elif defined USE_AS_STRCASECMP_L
> -# include "locale-defines.h"
> -
> -# define UPDATE_STRNCMP_COUNTER
> -#elif defined USE_AS_STRNCASECMP_L
> -# include "locale-defines.h"
> -
> -# define UPDATE_STRNCMP_COUNTER \
> - /* calculate left number to compare */ \
> - lea -16(%rcx, %r11), %r9; \
> - cmp %r9, %r11; \
> - jb LABEL(strcmp_exitz); \
> - test %r9, %r9; \
> - je LABEL(strcmp_exitz); \
> - mov %r9, %r11
> -#else
> -# define UPDATE_STRNCMP_COUNTER
> -# ifndef STRCMP
> -# define STRCMP strcmp
> -# endif
> -#endif
> -
> - .text
> -#ifdef USE_AS_STRCASECMP_L
> -# ifndef ENTRY2
> -# define ENTRY2(name) ENTRY (name)
> -# define END2(name) END (name)
> -# endif
> -
> -ENTRY2 (__strcasecmp)
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RDX_LP
> -
> - /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> - .p2align 4
> -END2 (__strcasecmp)
> -# ifndef NO_NOLOCALE_ALIAS
> -weak_alias (__strcasecmp, strcasecmp)
> -libc_hidden_def (__strcasecmp)
> -# endif
> - /* FALLTHROUGH to strcasecmp_l. */
> -#elif defined USE_AS_STRNCASECMP_L
> -# ifndef ENTRY2
> -# define ENTRY2(name) ENTRY (name)
> -# define END2(name) END (name)
> -# endif
> -
> -ENTRY2 (__strncasecmp)
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RCX_LP
> -
> - /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> - .p2align 4
> -END2 (__strncasecmp)
> -# ifndef NO_NOLOCALE_ALIAS
> -weak_alias (__strncasecmp, strncasecmp)
> -libc_hidden_def (__strncasecmp)
> -# endif
> - /* FALLTHROUGH to strncasecmp_l. */
> -#endif
> -
> -ENTRY (STRCMP)
> -#ifdef USE_AS_STRCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
> -# else
> - mov (%rdx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strcasecmp_l_nonascii
> -#elif defined USE_AS_STRNCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
> -# else
> - mov (%rcx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strncasecmp_l_nonascii
> -#endif
> -
> -/*
> - * This implementation uses SSE to compare up to 16 bytes at a time.
> - */
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - test %RDX_LP, %RDX_LP
> - je LABEL(strcmp_exitz)
> - cmp $1, %RDX_LP
> - je LABEL(Byte0)
> - mov %RDX_LP, %R11_LP
> -#endif
> - mov %esi, %ecx
> - mov %edi, %eax
> -/* Use 64bit AND here to avoid long NOP padding. */
> - and $0x3f, %rcx /* rsi alignment in cache line */
> - and $0x3f, %rax /* rdi alignment in cache line */
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - .section .rodata.cst16,"aM",@progbits,16
> - .align 16
> -.Llcase_min:
> - .quad 0x3f3f3f3f3f3f3f3f
> - .quad 0x3f3f3f3f3f3f3f3f
> -.Llcase_max:
> - .quad 0x9999999999999999
> - .quad 0x9999999999999999
> -.Lcase_add:
> - .quad 0x2020202020202020
> - .quad 0x2020202020202020
> - .previous
> - movdqa .Llcase_min(%rip), %xmm5
> -# define LCASE_MIN_reg %xmm5
> - movdqa .Llcase_max(%rip), %xmm6
> -# define LCASE_MAX_reg %xmm6
> - movdqa .Lcase_add(%rip), %xmm7
> -# define CASE_ADD_reg %xmm7
> -#endif
> - cmp $0x30, %ecx
> - ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */
> - cmp $0x30, %eax
> - ja LABEL(crosscache) /* rdi: 16-byte load will cross cache line */
> - movlpd (%rdi), %xmm1
> - movlpd (%rsi), %xmm2
> - movhpd 8(%rdi), %xmm1
> - movhpd 8(%rsi), %xmm2
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> - movdqa LCASE_MIN_reg, %xmm8; \
> - movdqa LCASE_MIN_reg, %xmm9; \
> - paddb reg1, %xmm8; \
> - paddb reg2, %xmm9; \
> - pcmpgtb LCASE_MAX_reg, %xmm8; \
> - pcmpgtb LCASE_MAX_reg, %xmm9; \
> - pandn CASE_ADD_reg, %xmm8; \
> - pandn CASE_ADD_reg, %xmm9; \
> - paddb %xmm8, reg1; \
> - paddb %xmm9, reg2
> - TOLOWER (%xmm1, %xmm2)
> -#else
> -# define TOLOWER(reg1, reg2)
> -#endif
> - pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> - pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> - jnz LABEL(less16bytes) /* If not, find different value or null char */
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz) /* finish comparision */
> -#endif
> - add $16, %rsi /* prepare to search next 16 bytes */
> - add $16, %rdi /* prepare to search next 16 bytes */
> -
> - /*
> - * Determine source and destination string offsets from 16-byte alignment.
> - * Use relative offset difference between the two to determine which case
> - * below to use.
> - */
> - .p2align 4
> -LABEL(crosscache):
> - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
> - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
> - mov $0xffff, %edx /* for equivalent offset */
> - xor %r8d, %r8d
> - and $0xf, %ecx /* offset of rsi */
> - and $0xf, %eax /* offset of rdi */
> - cmp %eax, %ecx
> - je LABEL(ashr_0) /* rsi and rdi relative offset same */
> - ja LABEL(bigger)
> - mov %edx, %r8d /* r8d is offset flag for exit tail */
> - xchg %ecx, %eax
> - xchg %rsi, %rdi
> -LABEL(bigger):
> - lea 15(%rax), %r9
> - sub %rcx, %r9
> - lea LABEL(unaligned_table)(%rip), %r10
> - movslq (%r10, %r9,4), %r9
> - lea (%r10, %r9), %r10
> - _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
> -
> -/*
> - * The following cases will be handled by ashr_0
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(0~15) n(0~15) 15(15+ n-n) ashr_0
> - */
> - .p2align 4
> -LABEL(ashr_0):
> -
> - movdqa (%rsi), %xmm1
> - pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> -#else
> - movdqa (%rdi), %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> -#endif
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - /*
> - * edx must be the same with r9d if in left byte (16-rcx) is equal to
> - * the start from (16-rax) and no null char was seen.
> - */
> - jne LABEL(less32bytes) /* mismatch or null char */
> - UPDATE_STRNCMP_COUNTER
> - mov $16, %rcx
> - mov $16, %r9
> - pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */
> -
> - /*
> - * Now both strings are aligned at 16-byte boundary. Loop over strings
> - * checking 32-bytes per iteration.
> - */
> - .p2align 4
> -LABEL(loop_ashr_0):
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit) /* mismatch or null char seen */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rcx
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rcx
> - jmp LABEL(loop_ashr_0)
> -
> -/*
> - * The following cases will be handled by ashr_1
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(15) n -15 0(15 +(n-15) - n) ashr_1
> - */
> - .p2align 4
> -LABEL(ashr_1):
> - pxor %xmm0, %xmm0
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> - pslldq $15, %xmm2 /* shift first string to align with second */
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - jnz LABEL(less32bytes) /* mismatch or null char seen */
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - pxor %xmm0, %xmm0
> - mov $16, %rcx /* index for loads*/
> - mov $1, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 1(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_1):
> - add $16, %r10
> - jg LABEL(nibble_ashr_1) /* cross page boundary */
> -
> -LABEL(gobble_ashr_1):
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4 /* store for next cycle */
> -
> - psrldq $1, %xmm3
> - pslldq $15, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_1) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4 /* store for next cycle */
> -
> - psrldq $1, %xmm3
> - pslldq $15, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_1)
> -
> - /*
> - * Nibble avoids loads across page boundary. This is to avoid a potential
> - * access into unmapped memory.
> - */
> - .p2align 4
> -LABEL(nibble_ashr_1):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/
> - pmovmskb %xmm0, %edx
> - test $0xfffe, %edx
> - jnz LABEL(ashr_1_exittail) /* find null char*/
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $15, %r11
> - jbe LABEL(ashr_1_exittail)
> -#endif
> -
> - pxor %xmm0, %xmm0
> - sub $0x1000, %r10 /* substract 4K from %r10 */
> - jmp LABEL(gobble_ashr_1)
> -
> - /*
> - * Once find null char, determine if there is a string mismatch
> - * before the null char.
> - */
> - .p2align 4
> -LABEL(ashr_1_exittail):
> - movdqa (%rsi, %rcx), %xmm1
> - psrldq $1, %xmm0
> - psrldq $1, %xmm3
> - jmp LABEL(aftertail)
> -
> -/*
> - * The following cases will be handled by ashr_2
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
> - */
> - .p2align 4
> -LABEL(ashr_2):
> - pxor %xmm0, %xmm0
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, %xmm0
> - pslldq $14, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - pxor %xmm0, %xmm0
> - mov $16, %rcx /* index for loads */
> - mov $2, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 2(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_2):
> - add $16, %r10
> - jg LABEL(nibble_ashr_2)
> -
> -LABEL(gobble_ashr_2):
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $2, %xmm3
> - pslldq $14, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_2) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $2, %xmm3
> - pslldq $14, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_2)
> -
> - .p2align 4
> -LABEL(nibble_ashr_2):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xfffc, %edx
> - jnz LABEL(ashr_2_exittail)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $14, %r11
> - jbe LABEL(ashr_2_exittail)
> -#endif
> -
> - pxor %xmm0, %xmm0
> - sub $0x1000, %r10
> - jmp LABEL(gobble_ashr_2)
> -
> - .p2align 4
> -LABEL(ashr_2_exittail):
> - movdqa (%rsi, %rcx), %xmm1
> - psrldq $2, %xmm0
> - psrldq $2, %xmm3
> - jmp LABEL(aftertail)
> -
> -/*
> - * The following cases will be handled by ashr_3
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
> - */
> - .p2align 4
> -LABEL(ashr_3): /* shift-by-3 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $13, %xmm2 /* low 3 bytes of xmm2 move to lanes 13..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $3, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 3(%rdi), %r10 /* r10 = rdi + 3 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_3):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_3) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_3):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $3, %xmm3 /* previous rdi chunk: discard its low 3 bytes */
> - pslldq $13, %xmm2 /* current rdi chunk: low 3 bytes into lanes 13..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_3) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $3, %xmm3
> - pslldq $13, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_3) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_3):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xfff8, %edx /* only lanes 3..15 of xmm3 are tested */
> - jnz LABEL(ashr_3_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $13, %r11
> - jbe LABEL(ashr_3_exittail) /* r11 is 13 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_3) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_3_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $3, %xmm0 /* shift the null mask right by 3 bytes */
> - psrldq $3, %xmm3 /* shift the pending rdi chunk right by 3 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_4
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
> - */
> - .p2align 4
> -LABEL(ashr_4): /* shift-by-4 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $12, %xmm2 /* low 4 bytes of xmm2 move to lanes 12..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $4, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 4(%rdi), %r10 /* r10 = rdi + 4 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_4):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_4) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_4):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $4, %xmm3 /* previous rdi chunk: discard its low 4 bytes */
> - pslldq $12, %xmm2 /* current rdi chunk: low 4 bytes into lanes 12..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_4) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $4, %xmm3
> - pslldq $12, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_4) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_4):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xfff0, %edx /* only lanes 4..15 of xmm3 are tested */
> - jnz LABEL(ashr_4_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $12, %r11
> - jbe LABEL(ashr_4_exittail) /* r11 is 12 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_4) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_4_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $4, %xmm0 /* shift the null mask right by 4 bytes */
> - psrldq $4, %xmm3 /* shift the pending rdi chunk right by 4 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_5
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
> - */
> - .p2align 4
> -LABEL(ashr_5): /* shift-by-5 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $11, %xmm2 /* low 5 bytes of xmm2 move to lanes 11..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $5, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 5(%rdi), %r10 /* r10 = rdi + 5 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_5):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_5) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_5):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $5, %xmm3 /* previous rdi chunk: discard its low 5 bytes */
> - pslldq $11, %xmm2 /* current rdi chunk: low 5 bytes into lanes 11..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_5) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $5, %xmm3
> - pslldq $11, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_5) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_5):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xffe0, %edx /* only lanes 5..15 of xmm3 are tested */
> - jnz LABEL(ashr_5_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $11, %r11
> - jbe LABEL(ashr_5_exittail) /* r11 is 11 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_5) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_5_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $5, %xmm0 /* shift the null mask right by 5 bytes */
> - psrldq $5, %xmm3 /* shift the pending rdi chunk right by 5 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_6
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
> - */
> - .p2align 4
> -LABEL(ashr_6): /* shift-by-6 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $10, %xmm2 /* low 6 bytes of xmm2 move to lanes 10..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $6, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 6(%rdi), %r10 /* r10 = rdi + 6 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_6):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_6) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_6):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $6, %xmm3 /* previous rdi chunk: discard its low 6 bytes */
> - pslldq $10, %xmm2 /* current rdi chunk: low 6 bytes into lanes 10..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_6) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $6, %xmm3
> - pslldq $10, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_6) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_6):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xffc0, %edx /* only lanes 6..15 of xmm3 are tested */
> - jnz LABEL(ashr_6_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $10, %r11
> - jbe LABEL(ashr_6_exittail) /* r11 is 10 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_6) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_6_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $6, %xmm0 /* shift the null mask right by 6 bytes */
> - psrldq $6, %xmm3 /* shift the pending rdi chunk right by 6 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_7
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
> - */
> - .p2align 4
> -LABEL(ashr_7): /* shift-by-7 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $9, %xmm2 /* low 7 bytes of xmm2 move to lanes 9..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $7, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 7(%rdi), %r10 /* r10 = rdi + 7 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_7):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_7) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_7):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $7, %xmm3 /* previous rdi chunk: discard its low 7 bytes */
> - pslldq $9, %xmm2 /* current rdi chunk: low 7 bytes into lanes 9..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_7) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $7, %xmm3
> - pslldq $9, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_7) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_7):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xff80, %edx /* only lanes 7..15 of xmm3 are tested */
> - jnz LABEL(ashr_7_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $9, %r11
> - jbe LABEL(ashr_7_exittail) /* r11 is 9 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_7) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_7_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $7, %xmm0 /* shift the null mask right by 7 bytes */
> - psrldq $7, %xmm3 /* shift the pending rdi chunk right by 7 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_8
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
> - */
> - .p2align 4
> -LABEL(ashr_8): /* shift-by-8 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $8, %xmm2 /* low 8 bytes of xmm2 move to lanes 8..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $8, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 8(%rdi), %r10 /* r10 = rdi + 8 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_8):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_8) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_8):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $8, %xmm3 /* previous rdi chunk: discard its low 8 bytes */
> - pslldq $8, %xmm2 /* current rdi chunk: low 8 bytes into lanes 8..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_8) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $8, %xmm3
> - pslldq $8, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_8) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_8):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xff00, %edx /* only lanes 8..15 of xmm3 are tested */
> - jnz LABEL(ashr_8_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $8, %r11
> - jbe LABEL(ashr_8_exittail) /* r11 is 8 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_8) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_8_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $8, %xmm0 /* shift the null mask right by 8 bytes */
> - psrldq $8, %xmm3 /* shift the pending rdi chunk right by 8 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_9
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
> - */
> - .p2align 4
> -LABEL(ashr_9): /* shift-by-9 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $7, %xmm2 /* low 9 bytes of xmm2 move to lanes 7..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $9, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 9(%rdi), %r10 /* r10 = rdi + 9 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_9):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_9) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_9):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $9, %xmm3 /* previous rdi chunk: discard its low 9 bytes */
> - pslldq $7, %xmm2 /* current rdi chunk: low 9 bytes into lanes 7..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_9) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $9, %xmm3
> - pslldq $7, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3 /* store for next cycle */
> - jmp LABEL(loop_ashr_9) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_9):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xfe00, %edx /* only lanes 9..15 of xmm3 are tested */
> - jnz LABEL(ashr_9_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $7, %r11
> - jbe LABEL(ashr_9_exittail) /* r11 is 7 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_9) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_9_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $9, %xmm0 /* shift the null mask right by 9 bytes */
> - psrldq $9, %xmm3 /* shift the pending rdi chunk right by 9 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_10
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
> - */
> - .p2align 4
> -LABEL(ashr_10): /* shift-by-10 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $6, %xmm2 /* low 10 bytes of xmm2 move to lanes 6..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $10, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 10(%rdi), %r10 /* r10 = rdi + 10 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_10):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_10) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_10):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $10, %xmm3 /* previous rdi chunk: discard its low 10 bytes */
> - pslldq $6, %xmm2 /* current rdi chunk: low 10 bytes into lanes 6..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_10) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $10, %xmm3
> - pslldq $6, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_10) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_10):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xfc00, %edx /* only lanes 10..15 of xmm3 are tested */
> - jnz LABEL(ashr_10_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $6, %r11
> - jbe LABEL(ashr_10_exittail) /* r11 is 6 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_10) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_10_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $10, %xmm0 /* shift the null mask right by 10 bytes */
> - psrldq $10, %xmm3 /* shift the pending rdi chunk right by 10 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_11
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
> - */
> - .p2align 4
> -LABEL(ashr_11): /* shift-by-11 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $5, %xmm2 /* low 11 bytes of xmm2 move to lanes 5..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $11, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 11(%rdi), %r10 /* r10 = rdi + 11 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_11):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_11) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_11):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $11, %xmm3 /* previous rdi chunk: discard its low 11 bytes */
> - pslldq $5, %xmm2 /* current rdi chunk: low 11 bytes into lanes 5..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_11) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $11, %xmm3
> - pslldq $5, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_11) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_11):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xf800, %edx /* only lanes 11..15 of xmm3 are tested */
> - jnz LABEL(ashr_11_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $5, %r11
> - jbe LABEL(ashr_11_exittail) /* r11 is 5 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_11) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_11_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $11, %xmm0 /* shift the null mask right by 11 bytes */
> - psrldq $11, %xmm3 /* shift the pending rdi chunk right by 11 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_12
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
> - */
> - .p2align 4
> -LABEL(ashr_12): /* shift-by-12 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $4, %xmm2 /* low 12 bytes of xmm2 move to lanes 4..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $12, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 12(%rdi), %r10 /* r10 = rdi + 12 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_12):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_12) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_12):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $12, %xmm3 /* previous rdi chunk: discard its low 12 bytes */
> - pslldq $4, %xmm2 /* current rdi chunk: low 12 bytes into lanes 4..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_12) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $12, %xmm3
> - pslldq $4, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_12) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_12):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xf000, %edx /* only lanes 12..15 of xmm3 are tested */
> - jnz LABEL(ashr_12_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $4, %r11
> - jbe LABEL(ashr_12_exittail) /* r11 is 4 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_12) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_12_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $12, %xmm0 /* shift the null mask right by 12 bytes */
> - psrldq $12, %xmm3 /* shift the pending rdi chunk right by 12 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_13
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
> - */
> - .p2align 4
> -LABEL(ashr_13): /* shift-by-13 path: one head compare, then a loop unrolled twice */
> - pxor %xmm0, %xmm0 /* xmm0 = 0 so pcmpeqb can flag null bytes */
> - movdqa (%rdi), %xmm2 /* aligned 16-byte load from rdi */
> - movdqa (%rsi), %xmm1 /* aligned 16-byte load from rsi */
> - pcmpeqb %xmm1, %xmm0 /* xmm0 = 0xff in lanes where xmm1 is null */
> - pslldq $3, %xmm2 /* low 13 bytes of xmm2 move to lanes 3..15, rest zeroed */
> - TOLOWER (%xmm1, %xmm2) /* macro defined outside this excerpt */
> - pcmpeqb %xmm1, %xmm2 /* xmm2 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm2 /* null lanes lose their sign bit */
> - pmovmskb %xmm2, %r9d /* r9d = lane sign bits: matching non-null lanes */
> - shr %cl, %edx /* cl and edx are set before this excerpt */
> - shr %cl, %r9d
> - sub %r9d, %edx /* nonzero result takes the branch below */
> - jnz LABEL(less32bytes) /* target defined outside this excerpt */
> - movdqa (%rdi), %xmm3 /* xmm3 = rdi chunk carried into the loop */
> -
> - UPDATE_STRNCMP_COUNTER /* macro defined outside this excerpt */
> -
> - pxor %xmm0, %xmm0 /* clear the null mask before the loop */
> - mov $16, %rcx /* index for loads */
> - mov $13, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 13(%rdi), %r10 /* r10 = rdi + 13 */
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_13):
> - add $16, %r10 /* advance the page counter by one chunk */
> - jg LABEL(nibble_ashr_13) /* positive: take the page-boundary check */
> -
> -LABEL(gobble_ashr_13):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - movdqa (%rdi, %rcx), %xmm2 /* rdi chunk at index rcx */
> - movdqa %xmm2, %xmm4 /* unshifted copy, reused as the next previous chunk */
> -
> - psrldq $13, %xmm3 /* previous rdi chunk: discard its low 13 bytes */
> - pslldq $3, %xmm2 /* current rdi chunk: low 13 bytes into lanes 3..15 */
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0 /* xmm0 (zero on entry) = null mask of xmm1 */
> - pcmpeqb %xmm2, %xmm1 /* xmm1 = 0xff in lanes where the bytes match */
> - psubb %xmm0, %xmm1 /* null lanes lose their sign bit */
> - pmovmskb %xmm1, %edx /* edx = mask of matching non-null lanes */
> - sub $0xffff, %edx /* zero only if all 16 lanes matched with no null */
> - jnz LABEL(exit) /* mismatch or null: exit is defined outside this excerpt */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11 /* r11: counter for the length-limited variants, presumably bytes left */
> - jbe LABEL(strcmp_exitz) /* taken when r11 was 16 or less (unsigned) */
> -#endif
> -
> - add $16, %rcx /* step the load index */
> - movdqa %xmm4, %xmm3 /* saved chunk becomes the previous chunk */
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_13) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1 /* second unrolled copy of the gobble body */
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $13, %xmm3
> - pslldq $3, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_13) /* back to the page-counter check */
> -
> - .p2align 4
> -LABEL(nibble_ashr_13):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xe000, %edx /* only lanes 13..15 of xmm3 are tested */
> - jnz LABEL(ashr_13_exittail) /* null found in those lanes */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $3, %r11
> - jbe LABEL(ashr_13_exittail) /* r11 is 3 or less (unsigned) */
> -#endif
> -
> - pxor %xmm0, %xmm0 /* clear the null mask again */
> - sub $0x1000, %r10 /* rewind the page counter by 4K */
> - jmp LABEL(gobble_ashr_13) /* resume the normal compare */
> -
> - .p2align 4
> -LABEL(ashr_13_exittail):
> - movdqa (%rsi, %rcx), %xmm1 /* rsi chunk at index rcx */
> - psrldq $13, %xmm0 /* shift the null mask right by 13 bytes */
> - psrldq $13, %xmm3 /* shift the pending rdi chunk right by 13 bytes */
> - jmp LABEL(aftertail) /* target defined outside this excerpt */
> -
> -/*
> - * The following cases will be handled by ashr_14
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
> - */
> - .p2align 4
> -LABEL(ashr_14):
> - pxor %xmm0, %xmm0
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, %xmm0
> - pslldq $2, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - pxor %xmm0, %xmm0
> - mov $16, %rcx /* index for loads */
> - mov $14, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 14(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_14):
> - add $16, %r10
> - jg LABEL(nibble_ashr_14)
> -
> -LABEL(gobble_ashr_14):
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $14, %xmm3
> - pslldq $2, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_14) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $14, %xmm3
> - pslldq $2, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP | defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_14)
> -
> - .p2align 4
> -LABEL(nibble_ashr_14):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0xc000, %edx
> - jnz LABEL(ashr_14_exittail)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp $2, %r11
> - jbe LABEL(ashr_14_exittail)
> -#endif
> -
> - pxor %xmm0, %xmm0
> - sub $0x1000, %r10
> - jmp LABEL(gobble_ashr_14)
> -
> - .p2align 4
> -LABEL(ashr_14_exittail):
> - movdqa (%rsi, %rcx), %xmm1
> - psrldq $14, %xmm0
> - psrldq $14, %xmm3
> - jmp LABEL(aftertail)
> -
> -/*
> - * The following cases will be handled by ashr_15
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
> - */
> - .p2align 4
> -LABEL(ashr_15):
> - pxor %xmm0, %xmm0
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, %xmm0
> - pslldq $1, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> -
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - pxor %xmm0, %xmm0
> - mov $16, %rcx /* index for loads */
> - mov $15, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 15(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> -
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - .p2align 4
> -LABEL(loop_ashr_15):
> - add $16, %r10
> - jg LABEL(nibble_ashr_15)
> -
> -LABEL(gobble_ashr_15):
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $15, %xmm3
> - pslldq $1, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> -
> - add $16, %r10
> - jg LABEL(nibble_ashr_15) /* cross page boundary */
> -
> - movdqa (%rsi, %rcx), %xmm1
> - movdqa (%rdi, %rcx), %xmm2
> - movdqa %xmm2, %xmm4
> -
> - psrldq $15, %xmm3
> - pslldq $1, %xmm2
> - por %xmm3, %xmm2 /* merge into one 16byte value */
> -
> - TOLOWER (%xmm1, %xmm2)
> -
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm2, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx
> - jnz LABEL(exit)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rcx
> - movdqa %xmm4, %xmm3
> - jmp LABEL(loop_ashr_15)
> -
> - .p2align 4
> -LABEL(nibble_ashr_15):
> - pcmpeqb %xmm3, %xmm0 /* check nibble for null char */
> - pmovmskb %xmm0, %edx
> - test $0x8000, %edx
> - jnz LABEL(ashr_15_exittail)
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmpq $1, %r11
> - jbe LABEL(ashr_15_exittail)
> -#endif
> -
> - pxor %xmm0, %xmm0
> - sub $0x1000, %r10
> - jmp LABEL(gobble_ashr_15)
> -
> - .p2align 4
> -LABEL(ashr_15_exittail):
> - movdqa (%rsi, %rcx), %xmm1
> - psrldq $15, %xmm3
> - psrldq $15, %xmm0
> -
> - .p2align 4
> -LABEL(aftertail):
> - TOLOWER (%xmm1, %xmm3)
> - pcmpeqb %xmm3, %xmm1
> - psubb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - not %edx
> -
> - .p2align 4
> -LABEL(exit):
> - lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */
> -LABEL(less32bytes):
> - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
> - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
> - test %r8d, %r8d
> - jz LABEL(ret)
> - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
> -
> - .p2align 4
> -LABEL(ret):
> -LABEL(less16bytes):
> - bsf %rdx, %rdx /* find and store bit index in %rdx */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rdx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - movzbl (%rsi, %rdx), %ecx
> - movzbl (%rdi, %rdx), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> -
> -LABEL(strcmp_exitz):
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -LABEL(Byte0):
> - movzbl (%rsi), %ecx
> - movzbl (%rdi), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> -END (STRCMP)
> -
> - .section .rodata,"a",@progbits
> - .p2align 3
> -LABEL(unaligned_table):
> - .int LABEL(ashr_1) - LABEL(unaligned_table)
> - .int LABEL(ashr_2) - LABEL(unaligned_table)
> - .int LABEL(ashr_3) - LABEL(unaligned_table)
> - .int LABEL(ashr_4) - LABEL(unaligned_table)
> - .int LABEL(ashr_5) - LABEL(unaligned_table)
> - .int LABEL(ashr_6) - LABEL(unaligned_table)
> - .int LABEL(ashr_7) - LABEL(unaligned_table)
> - .int LABEL(ashr_8) - LABEL(unaligned_table)
> - .int LABEL(ashr_9) - LABEL(unaligned_table)
> - .int LABEL(ashr_10) - LABEL(unaligned_table)
> - .int LABEL(ashr_11) - LABEL(unaligned_table)
> - .int LABEL(ashr_12) - LABEL(unaligned_table)
> - .int LABEL(ashr_13) - LABEL(unaligned_table)
> - .int LABEL(ashr_14) - LABEL(unaligned_table)
> - .int LABEL(ashr_15) - LABEL(unaligned_table)
> - .int LABEL(ashr_0) - LABEL(unaligned_table)
> -libc_hidden_builtin_def (STRCMP)
> +#include "multiarch/strcmp-sse2.S"
> +libc_hidden_builtin_def (strcmp)
> diff --git a/sysdeps/x86_64/strncase_l.S b/sysdeps/x86_64/strncase_l.S
> index c725cd85b3..3780fc50b1 100644
> --- a/sysdeps/x86_64/strncase_l.S
> +++ b/sysdeps/x86_64/strncase_l.S
> @@ -1,6 +1,11 @@
> -#define STRCMP __strncasecmp_l
> -#define USE_AS_STRNCASECMP_L
> -#include "strcmp.S"
> +/* Symbols = __strncasecmp_l and __strncasecmp. */
> +
> +#include "multiarch/strncase_l-sse2.S"
> +
> +libc_hidden_builtin_def (__strncasecmp_l)
>
> weak_alias (__strncasecmp_l, strncasecmp_l)
> libc_hidden_def (strncasecmp_l)
> +
> +weak_alias (__strncasecmp, strncasecmp)
> +libc_hidden_def (__strncasecmp)
> diff --git a/sysdeps/x86_64/strncmp.S b/sysdeps/x86_64/strncmp.S
> index 0af34e7f15..13d9e82ee2 100644
> --- a/sysdeps/x86_64/strncmp.S
> +++ b/sysdeps/x86_64/strncmp.S
> @@ -1,3 +1,4 @@
> -#define STRCMP strncmp
> -#define USE_AS_STRNCMP
> -#include "strcmp.S"
> +/* Symbol = strncmp. */
> +
> +#include "multiarch/strncmp-sse2.S"
> +libc_hidden_builtin_def (strncmp)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v1 3/4] x86: Move wcscmp SSE2 implementation to multiarch/wcscmp-sse2.S
2022-07-12 19:28 [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII Noah Goldstein
2022-07-12 19:28 ` [PATCH v1 2/4] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S Noah Goldstein
@ 2022-07-12 19:28 ` Noah Goldstein
2022-07-13 0:05 ` H.J. Lu
2022-07-12 19:28 ` [PATCH v1 4/4] x86: Move strcmp SSE42 implementation to multiarch/strcmp-sse4_2.S Noah Goldstein
2022-07-12 23:30 ` [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII H.J. Lu
3 siblings, 1 reply; 8+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:28 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/wcscmp-sse2.S | 936 ++++++++++++++++++++++++-
sysdeps/x86_64/wcscmp.S | 932 +-----------------------
2 files changed, 934 insertions(+), 934 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcscmp-sse2.S b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
index 72a19bd64d..6cb7d9faf9 100644
--- a/sysdeps/x86_64/multiarch/wcscmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
@@ -16,8 +16,936 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# define __wcscmp __wcscmp_sse2
-#endif
+#define USE_AS_WCSCMP
+#define STRCMP_ISA _sse2
+#include "strcmp-naming.h"
-#include "../wcscmp.S"
+#include <sysdep.h>
+
+/* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */
+
+ .text
+ENTRY (STRCMP)
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+*/
+ mov %esi, %eax
+ mov %edi, %edx
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ mov %al, %ch
+ mov %dl, %cl
+ and $63, %eax /* rsi alignment in cache line */
+ and $63, %edx /* rdi alignment in cache line */
+ and $15, %cl
+ jz L(continue_00)
+ cmp $16, %edx
+ jb L(continue_0)
+ cmp $32, %edx
+ jb L(continue_16)
+ cmp $48, %edx
+ jb L(continue_32)
+
+L(continue_48):
+ and $15, %ch
+ jz L(continue_48_00)
+ cmp $16, %eax
+ jb L(continue_0_48)
+ cmp $32, %eax
+ jb L(continue_16_48)
+ cmp $48, %eax
+ jb L(continue_32_48)
+
+ .p2align 4
+L(continue_48_48):
+ mov (%rsi), %ecx
+ cmp %ecx, (%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%rsi), %ecx
+ cmp %ecx, 4(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%rsi), %ecx
+ cmp %ecx, 8(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%rsi), %ecx
+ cmp %ecx, 12(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%rdi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rdi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%rdi), %xmm1
+ movdqu 48(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_48_48)
+
+L(continue_0):
+ and $15, %ch
+ jz L(continue_0_00)
+ cmp $16, %eax
+ jb L(continue_0_0)
+ cmp $32, %eax
+ jb L(continue_0_16)
+ cmp $48, %eax
+ jb L(continue_0_32)
+
+ .p2align 4
+L(continue_0_48):
+ mov (%rsi), %ecx
+ cmp %ecx, (%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%rsi), %ecx
+ cmp %ecx, 4(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%rsi), %ecx
+ cmp %ecx, 8(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%rsi), %ecx
+ cmp %ecx, 12(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%rdi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rdi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ mov 48(%rsi), %ecx
+ cmp %ecx, 48(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 52(%rsi), %ecx
+ cmp %ecx, 52(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 56(%rsi), %ecx
+ cmp %ecx, 56(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 60(%rsi), %ecx
+ cmp %ecx, 60(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_0_48)
+
+ .p2align 4
+L(continue_00):
+ and $15, %ch
+ jz L(continue_00_00)
+ cmp $16, %eax
+ jb L(continue_00_0)
+ cmp $32, %eax
+ jb L(continue_00_16)
+ cmp $48, %eax
+ jb L(continue_00_32)
+
+ .p2align 4
+L(continue_00_48):
+ pcmpeqd (%rdi), %xmm0
+ mov (%rdi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%rsi), %eax
+ jne L(nequal)
+
+ mov 4(%rdi), %eax
+ cmp 4(%rsi), %eax
+ jne L(nequal)
+
+ mov 8(%rdi), %eax
+ cmp 8(%rsi), %eax
+ jne L(nequal)
+
+ mov 12(%rdi), %eax
+ cmp 12(%rsi), %eax
+ jne L(nequal)
+
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_32):
+ and $15, %ch
+ jz L(continue_32_00)
+ cmp $16, %eax
+ jb L(continue_0_32)
+ cmp $32, %eax
+ jb L(continue_16_32)
+ cmp $48, %eax
+ jb L(continue_32_32)
+
+ .p2align 4
+L(continue_32_48):
+ mov (%rsi), %ecx
+ cmp %ecx, (%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%rsi), %ecx
+ cmp %ecx, 4(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%rsi), %ecx
+ cmp %ecx, 8(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%rsi), %ecx
+ cmp %ecx, 12(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 16(%rsi), %ecx
+ cmp %ecx, 16(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 20(%rsi), %ecx
+ cmp %ecx, 20(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 24(%rsi), %ecx
+ cmp %ecx, 24(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 28(%rsi), %ecx
+ cmp %ecx, 28(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 32(%rdi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%rdi), %xmm1
+ movdqu 48(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_16):
+ and $15, %ch
+ jz L(continue_16_00)
+ cmp $16, %eax
+ jb L(continue_0_16)
+ cmp $32, %eax
+ jb L(continue_16_16)
+ cmp $48, %eax
+ jb L(continue_16_32)
+
+ .p2align 4
+L(continue_16_48):
+ mov (%rsi), %ecx
+ cmp %ecx, (%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 4(%rsi), %ecx
+ cmp %ecx, 4(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%rsi), %ecx
+ cmp %ecx, 8(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%rsi), %ecx
+ cmp %ecx, 12(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 16(%rdi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ mov 32(%rsi), %ecx
+ cmp %ecx, 32(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 36(%rsi), %ecx
+ cmp %ecx, 36(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 40(%rsi), %ecx
+ cmp %ecx, 40(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 44(%rsi), %ecx
+ cmp %ecx, 44(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ movdqu 48(%rdi), %xmm1
+ movdqu 48(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_00_00):
+ movdqa (%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqa 16(%rdi), %xmm3
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqa 32(%rdi), %xmm5
+ pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm5 /* packed sub of comparison results*/
+ pmovmskb %xmm5, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqa 48(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_00_00)
+
+ .p2align 4
+L(continue_00_32):
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %rsi
+ add $16, %rdi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_16):
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %rsi
+ add $32, %rdi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_00_0):
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %rsi
+ add $48, %rdi
+ jmp L(continue_00_48)
+
+ .p2align 4
+L(continue_48_00):
+ pcmpeqd (%rsi), %xmm0
+ mov (%rdi), %eax
+ pmovmskb %xmm0, %ecx
+ test %ecx, %ecx
+ jnz L(less4_double_words1)
+
+ cmp (%rsi), %eax
+ jne L(nequal)
+
+ mov 4(%rdi), %eax
+ cmp 4(%rsi), %eax
+ jne L(nequal)
+
+ mov 8(%rdi), %eax
+ cmp 8(%rsi), %eax
+ jne L(nequal)
+
+ mov 12(%rdi), %eax
+ cmp 12(%rsi), %eax
+ jne L(nequal)
+
+ movdqu 16(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ movdqu 48(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_48)
+
+ add $64, %rsi
+ add $64, %rdi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_00):
+ movdqu (%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %rsi
+ add $16, %rdi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_16_00):
+ movdqu (%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %rsi
+ add $32, %rdi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_0_00):
+ movdqu (%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rdi), %xmm1
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %rsi
+ add $48, %rdi
+ jmp L(continue_48_00)
+
+ .p2align 4
+L(continue_32_32):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %rsi
+ add $16, %rdi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_16_16):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rdi), %xmm3
+ movdqu 16(%rsi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %rsi
+ add $32, %rdi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_0):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rdi), %xmm3
+ movdqu 16(%rsi), %xmm4
+ pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm3 /* packed sub of comparison results*/
+ pmovmskb %xmm3, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ movdqu 32(%rdi), %xmm1
+ movdqu 32(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_32)
+
+ add $48, %rsi
+ add $48, %rdi
+ jmp L(continue_48_48)
+
+ .p2align 4
+L(continue_0_16):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ movdqu 16(%rdi), %xmm1
+ movdqu 16(%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words_16)
+
+ add $32, %rsi
+ add $32, %rdi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(continue_0_32):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %rsi
+ add $16, %rdi
+ jmp L(continue_16_48)
+
+ .p2align 4
+L(continue_16_32):
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+ pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
+ pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
+ jnz L(less4_double_words)
+
+ add $16, %rsi
+ add $16, %rdi
+ jmp L(continue_32_48)
+
+ .p2align 4
+L(less4_double_words1):
+ cmp (%rsi), %eax
+ jne L(nequal)
+ test %eax, %eax
+ jz L(equal)
+
+ mov 4(%rsi), %ecx
+ cmp %ecx, 4(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 8(%rsi), %ecx
+ cmp %ecx, 8(%rdi)
+ jne L(nequal)
+ test %ecx, %ecx
+ jz L(equal)
+
+ mov 12(%rsi), %ecx
+ cmp %ecx, 12(%rdi)
+ jne L(nequal)
+ xor %eax, %eax
+ ret
+
+ .p2align 4
+L(less4_double_words):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words)
+ and $15, %dl
+ jz L(second_double_word)
+ mov (%rdi), %eax
+ cmp (%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(second_double_word):
+ mov 4(%rdi), %eax
+ cmp 4(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(next_two_double_words):
+ and $15, %dh
+ jz L(fourth_double_word)
+ mov 8(%rdi), %eax
+ cmp 8(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(fourth_double_word):
+ mov 12(%rdi), %eax
+ cmp 12(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(less4_double_words_16):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_16)
+ and $15, %dl
+ jz L(second_double_word_16)
+ mov 16(%rdi), %eax
+ cmp 16(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(second_double_word_16):
+ mov 20(%rdi), %eax
+ cmp 20(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(next_two_double_words_16):
+ and $15, %dh
+ jz L(fourth_double_word_16)
+ mov 24(%rdi), %eax
+ cmp 24(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(fourth_double_word_16):
+ mov 28(%rdi), %eax
+ cmp 28(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(less4_double_words_32):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_32)
+ and $15, %dl
+ jz L(second_double_word_32)
+ mov 32(%rdi), %eax
+ cmp 32(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(second_double_word_32):
+ mov 36(%rdi), %eax
+ cmp 36(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(next_two_double_words_32):
+ and $15, %dh
+ jz L(fourth_double_word_32)
+ mov 40(%rdi), %eax
+ cmp 40(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(fourth_double_word_32):
+ mov 44(%rdi), %eax
+ cmp 44(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(less4_double_words_48):
+ xor %eax, %eax
+ test %dl, %dl
+ jz L(next_two_double_words_48)
+ and $15, %dl
+ jz L(second_double_word_48)
+ mov 48(%rdi), %eax
+ cmp 48(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(second_double_word_48):
+ mov 52(%rdi), %eax
+ cmp 52(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(next_two_double_words_48):
+ and $15, %dh
+ jz L(fourth_double_word_48)
+ mov 56(%rdi), %eax
+ cmp 56(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(fourth_double_word_48):
+ mov 60(%rdi), %eax
+ cmp 60(%rsi), %eax
+ jne L(nequal)
+ ret
+
+ .p2align 4
+L(nequal):
+ mov $1, %eax
+ jg L(nequal_bigger)
+ neg %eax
+
+L(nequal_bigger):
+ ret
+
+ .p2align 4
+L(equal):
+ xor %rax, %rax
+ ret
+
+END (STRCMP)
diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S
index 5cb42f47b9..e04cdbf5fe 100644
--- a/sysdeps/x86_64/wcscmp.S
+++ b/sysdeps/x86_64/wcscmp.S
@@ -16,936 +16,8 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+/* Symbol = __wcscmp. */
-/* Note: wcscmp uses signed comparison, not unsighed as in strcmp function. */
-
- .text
-ENTRY (__wcscmp)
-/*
- * This implementation uses SSE to compare up to 16 bytes at a time.
-*/
- mov %esi, %eax
- mov %edi, %edx
- pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
- mov %al, %ch
- mov %dl, %cl
- and $63, %eax /* rsi alignment in cache line */
- and $63, %edx /* rdi alignment in cache line */
- and $15, %cl
- jz L(continue_00)
- cmp $16, %edx
- jb L(continue_0)
- cmp $32, %edx
- jb L(continue_16)
- cmp $48, %edx
- jb L(continue_32)
-
-L(continue_48):
- and $15, %ch
- jz L(continue_48_00)
- cmp $16, %eax
- jb L(continue_0_48)
- cmp $32, %eax
- jb L(continue_16_48)
- cmp $48, %eax
- jb L(continue_32_48)
-
- .p2align 4
-L(continue_48_48):
- mov (%rsi), %ecx
- cmp %ecx, (%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 4(%rsi), %ecx
- cmp %ecx, 4(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 8(%rsi), %ecx
- cmp %ecx, 8(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 12(%rsi), %ecx
- cmp %ecx, 12(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- movdqu 16(%rdi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rdi), %xmm1
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- movdqu 48(%rdi), %xmm1
- movdqu 48(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_48_48)
-
-L(continue_0):
- and $15, %ch
- jz L(continue_0_00)
- cmp $16, %eax
- jb L(continue_0_0)
- cmp $32, %eax
- jb L(continue_0_16)
- cmp $48, %eax
- jb L(continue_0_32)
-
- .p2align 4
-L(continue_0_48):
- mov (%rsi), %ecx
- cmp %ecx, (%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 4(%rsi), %ecx
- cmp %ecx, 4(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 8(%rsi), %ecx
- cmp %ecx, 8(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 12(%rsi), %ecx
- cmp %ecx, 12(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- movdqu 16(%rdi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rdi), %xmm1
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- mov 48(%rsi), %ecx
- cmp %ecx, 48(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 52(%rsi), %ecx
- cmp %ecx, 52(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 56(%rsi), %ecx
- cmp %ecx, 56(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 60(%rsi), %ecx
- cmp %ecx, 60(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_0_48)
-
- .p2align 4
-L(continue_00):
- and $15, %ch
- jz L(continue_00_00)
- cmp $16, %eax
- jb L(continue_00_0)
- cmp $32, %eax
- jb L(continue_00_16)
- cmp $48, %eax
- jb L(continue_00_32)
-
- .p2align 4
-L(continue_00_48):
- pcmpeqd (%rdi), %xmm0
- mov (%rdi), %eax
- pmovmskb %xmm0, %ecx
- test %ecx, %ecx
- jnz L(less4_double_words1)
-
- cmp (%rsi), %eax
- jne L(nequal)
-
- mov 4(%rdi), %eax
- cmp 4(%rsi), %eax
- jne L(nequal)
-
- mov 8(%rdi), %eax
- cmp 8(%rsi), %eax
- jne L(nequal)
-
- mov 12(%rdi), %eax
- cmp 12(%rsi), %eax
- jne L(nequal)
-
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- movdqu 48(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_00_48)
-
- .p2align 4
-L(continue_32):
- and $15, %ch
- jz L(continue_32_00)
- cmp $16, %eax
- jb L(continue_0_32)
- cmp $32, %eax
- jb L(continue_16_32)
- cmp $48, %eax
- jb L(continue_32_32)
-
- .p2align 4
-L(continue_32_48):
- mov (%rsi), %ecx
- cmp %ecx, (%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 4(%rsi), %ecx
- cmp %ecx, 4(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 8(%rsi), %ecx
- cmp %ecx, 8(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 12(%rsi), %ecx
- cmp %ecx, 12(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 16(%rsi), %ecx
- cmp %ecx, 16(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 20(%rsi), %ecx
- cmp %ecx, 20(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 24(%rsi), %ecx
- cmp %ecx, 24(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 28(%rsi), %ecx
- cmp %ecx, 28(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- movdqu 32(%rdi), %xmm1
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- movdqu 48(%rdi), %xmm1
- movdqu 48(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_32_48)
-
- .p2align 4
-L(continue_16):
- and $15, %ch
- jz L(continue_16_00)
- cmp $16, %eax
- jb L(continue_0_16)
- cmp $32, %eax
- jb L(continue_16_16)
- cmp $48, %eax
- jb L(continue_16_32)
-
- .p2align 4
-L(continue_16_48):
- mov (%rsi), %ecx
- cmp %ecx, (%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 4(%rsi), %ecx
- cmp %ecx, 4(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 8(%rsi), %ecx
- cmp %ecx, 8(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 12(%rsi), %ecx
- cmp %ecx, 12(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- movdqu 16(%rdi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- mov 32(%rsi), %ecx
- cmp %ecx, 32(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 36(%rsi), %ecx
- cmp %ecx, 36(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 40(%rsi), %ecx
- cmp %ecx, 40(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 44(%rsi), %ecx
- cmp %ecx, 44(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- movdqu 48(%rdi), %xmm1
- movdqu 48(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_16_48)
-
- .p2align 4
-L(continue_00_00):
- movdqa (%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm3 /* packed sub of comparison results*/
- pmovmskb %xmm3, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqa 32(%rdi), %xmm5
- pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
- pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm5 /* packed sub of comparison results*/
- pmovmskb %xmm5, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- movdqa 48(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_00_00)
-
- .p2align 4
-L(continue_00_32):
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- add $16, %rsi
- add $16, %rdi
- jmp L(continue_00_48)
-
- .p2align 4
-L(continue_00_16):
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- add $32, %rsi
- add $32, %rdi
- jmp L(continue_00_48)
-
- .p2align 4
-L(continue_00_0):
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
- pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- add $48, %rsi
- add $48, %rdi
- jmp L(continue_00_48)
-
- .p2align 4
-L(continue_48_00):
- pcmpeqd (%rsi), %xmm0
- mov (%rdi), %eax
- pmovmskb %xmm0, %ecx
- test %ecx, %ecx
- jnz L(less4_double_words1)
-
- cmp (%rsi), %eax
- jne L(nequal)
-
- mov 4(%rdi), %eax
- cmp 4(%rsi), %eax
- jne L(nequal)
-
- mov 8(%rdi), %eax
- cmp 8(%rsi), %eax
- jne L(nequal)
-
- mov 12(%rdi), %eax
- cmp 12(%rsi), %eax
- jne L(nequal)
-
- movdqu 16(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- movdqu 48(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_48)
-
- add $64, %rsi
- add $64, %rdi
- jmp L(continue_48_00)
-
- .p2align 4
-L(continue_32_00):
- movdqu (%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- add $16, %rsi
- add $16, %rdi
- jmp L(continue_48_00)
-
- .p2align 4
-L(continue_16_00):
- movdqu (%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- add $32, %rsi
- add $32, %rdi
- jmp L(continue_48_00)
-
- .p2align 4
-L(continue_0_00):
- movdqu (%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rdi), %xmm1
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- add $48, %rsi
- add $48, %rdi
- jmp L(continue_48_00)
-
- .p2align 4
-L(continue_32_32):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- add $16, %rsi
- add $16, %rdi
- jmp L(continue_48_48)
-
- .p2align 4
-L(continue_16_16):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rdi), %xmm3
- movdqu 16(%rsi), %xmm4
- pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm3 /* packed sub of comparison results*/
- pmovmskb %xmm3, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- add $32, %rsi
- add $32, %rdi
- jmp L(continue_48_48)
-
- .p2align 4
-L(continue_0_0):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rdi), %xmm3
- movdqu 16(%rsi), %xmm4
- pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm3 /* packed sub of comparison results*/
- pmovmskb %xmm3, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- movdqu 32(%rdi), %xmm1
- movdqu 32(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_32)
-
- add $48, %rsi
- add $48, %rdi
- jmp L(continue_48_48)
-
- .p2align 4
-L(continue_0_16):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- movdqu 16(%rdi), %xmm1
- movdqu 16(%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words_16)
-
- add $32, %rsi
- add $32, %rdi
- jmp L(continue_32_48)
-
- .p2align 4
-L(continue_0_32):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- add $16, %rsi
- add $16, %rdi
- jmp L(continue_16_48)
-
- .p2align 4
-L(continue_16_32):
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
- pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
- pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
- jnz L(less4_double_words)
-
- add $16, %rsi
- add $16, %rdi
- jmp L(continue_32_48)
-
- .p2align 4
-L(less4_double_words1):
- cmp (%rsi), %eax
- jne L(nequal)
- test %eax, %eax
- jz L(equal)
-
- mov 4(%rsi), %ecx
- cmp %ecx, 4(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 8(%rsi), %ecx
- cmp %ecx, 8(%rdi)
- jne L(nequal)
- test %ecx, %ecx
- jz L(equal)
-
- mov 12(%rsi), %ecx
- cmp %ecx, 12(%rdi)
- jne L(nequal)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(less4_double_words):
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov (%rdi), %eax
- cmp (%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(second_double_word):
- mov 4(%rdi), %eax
- cmp 4(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov 8(%rdi), %eax
- cmp 8(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov 12(%rdi), %eax
- cmp 12(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(less4_double_words_16):
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words_16)
- and $15, %dl
- jz L(second_double_word_16)
- mov 16(%rdi), %eax
- cmp 16(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(second_double_word_16):
- mov 20(%rdi), %eax
- cmp 20(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(next_two_double_words_16):
- and $15, %dh
- jz L(fourth_double_word_16)
- mov 24(%rdi), %eax
- cmp 24(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(fourth_double_word_16):
- mov 28(%rdi), %eax
- cmp 28(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(less4_double_words_32):
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words_32)
- and $15, %dl
- jz L(second_double_word_32)
- mov 32(%rdi), %eax
- cmp 32(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(second_double_word_32):
- mov 36(%rdi), %eax
- cmp 36(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(next_two_double_words_32):
- and $15, %dh
- jz L(fourth_double_word_32)
- mov 40(%rdi), %eax
- cmp 40(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(fourth_double_word_32):
- mov 44(%rdi), %eax
- cmp 44(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(less4_double_words_48):
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words_48)
- and $15, %dl
- jz L(second_double_word_48)
- mov 48(%rdi), %eax
- cmp 48(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(second_double_word_48):
- mov 52(%rdi), %eax
- cmp 52(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(next_two_double_words_48):
- and $15, %dh
- jz L(fourth_double_word_48)
- mov 56(%rdi), %eax
- cmp 56(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(fourth_double_word_48):
- mov 60(%rdi), %eax
- cmp 60(%rsi), %eax
- jne L(nequal)
- ret
-
- .p2align 4
-L(nequal):
- mov $1, %eax
- jg L(nequal_bigger)
- neg %eax
-
-L(nequal_bigger):
- ret
-
- .p2align 4
-L(equal):
- xor %rax, %rax
- ret
-
-END (__wcscmp)
-#ifndef __wcscmp
+#include "multiarch/wcscmp-sse2.S"
libc_hidden_def (__wcscmp)
weak_alias (__wcscmp, wcscmp)
-#endif
--
2.34.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 3/4] x86: Move wcscmp SSE2 implementation to multiarch/wcscmp-sse2.S
2022-07-12 19:28 ` [PATCH v1 3/4] x86: Move wcscmp SSE2 implementation to multiarch/wcscmp-sse2.S Noah Goldstein
@ 2022-07-13 0:05 ` H.J. Lu
0 siblings, 0 replies; 8+ messages in thread
From: H.J. Lu @ 2022-07-13 0:05 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/wcscmp-sse2.S | 936 ++++++++++++++++++++++++-
> sysdeps/x86_64/wcscmp.S | 932 +-----------------------
> 2 files changed, 934 insertions(+), 934 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcscmp-sse2.S b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
> index 72a19bd64d..6cb7d9faf9 100644
> --- a/sysdeps/x86_64/multiarch/wcscmp-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
> @@ -16,8 +16,936 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#if IS_IN (libc)
> -# define __wcscmp __wcscmp_sse2
> -#endif
> +#define USE_AS_WCSCMP
> +#define STRCMP_ISA _sse2
> +#include "strcmp-naming.h"
>
> -#include "../wcscmp.S"
> +#include <sysdep.h>
> +
> +/* Note: wcscmp uses signed comparison, not unsigned as in strcmp function. */
> +
> + .text
> +ENTRY (STRCMP)
> +/*
> + * This implementation uses SSE to compare up to 16 bytes at a time.
> +*/
> + mov %esi, %eax
> + mov %edi, %edx
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> + mov %al, %ch
> + mov %dl, %cl
> + and $63, %eax /* rsi alignment in cache line */
> + and $63, %edx /* rdi alignment in cache line */
> + and $15, %cl
> + jz L(continue_00)
> + cmp $16, %edx
> + jb L(continue_0)
> + cmp $32, %edx
> + jb L(continue_16)
> + cmp $48, %edx
> + jb L(continue_32)
> +
> +L(continue_48):
> + and $15, %ch
> + jz L(continue_48_00)
> + cmp $16, %eax
> + jb L(continue_0_48)
> + cmp $32, %eax
> + jb L(continue_16_48)
> + cmp $48, %eax
> + jb L(continue_32_48)
> +
> + .p2align 4
> +L(continue_48_48):
> + mov (%rsi), %ecx
> + cmp %ecx, (%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 4(%rsi), %ecx
> + cmp %ecx, 4(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 8(%rsi), %ecx
> + cmp %ecx, 8(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 12(%rsi), %ecx
> + cmp %ecx, 12(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + movdqu 16(%rdi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rdi), %xmm1
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_32)
> +
> + movdqu 48(%rdi), %xmm1
> + movdqu 48(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi
> + add $64, %rdi
> + jmp L(continue_48_48)
> +
> +L(continue_0):
> + and $15, %ch
> + jz L(continue_0_00)
> + cmp $16, %eax
> + jb L(continue_0_0)
> + cmp $32, %eax
> + jb L(continue_0_16)
> + cmp $48, %eax
> + jb L(continue_0_32)
> +
> + .p2align 4
> +L(continue_0_48):
> + mov (%rsi), %ecx
> + cmp %ecx, (%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 4(%rsi), %ecx
> + cmp %ecx, 4(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 8(%rsi), %ecx
> + cmp %ecx, 8(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 12(%rsi), %ecx
> + cmp %ecx, 12(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + movdqu 16(%rdi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rdi), %xmm1
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_32)
> +
> + mov 48(%rsi), %ecx
> + cmp %ecx, 48(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 52(%rsi), %ecx
> + cmp %ecx, 52(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 56(%rsi), %ecx
> + cmp %ecx, 56(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 60(%rsi), %ecx
> + cmp %ecx, 60(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + add $64, %rsi
> + add $64, %rdi
> + jmp L(continue_0_48)
> +
> + .p2align 4
> +L(continue_00):
> + and $15, %ch
> + jz L(continue_00_00)
> + cmp $16, %eax
> + jb L(continue_00_0)
> + cmp $32, %eax
> + jb L(continue_00_16)
> + cmp $48, %eax
> + jb L(continue_00_32)
> +
> + .p2align 4
> +L(continue_00_48):
> + pcmpeqd (%rdi), %xmm0
> + mov (%rdi), %eax
> + pmovmskb %xmm0, %ecx
> + test %ecx, %ecx
> + jnz L(less4_double_words1)
> +
> + cmp (%rsi), %eax
> + jne L(nequal)
> +
> + mov 4(%rdi), %eax
> + cmp 4(%rsi), %eax
> + jne L(nequal)
> +
> + mov 8(%rdi), %eax
> + cmp 8(%rsi), %eax
> + jne L(nequal)
> +
> + mov 12(%rdi), %eax
> + cmp 12(%rsi), %eax
> + jne L(nequal)
> +
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_32)
> +
> + movdqu 48(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi
> + add $64, %rdi
> + jmp L(continue_00_48)
> +
> + .p2align 4
> +L(continue_32): /* Dispatch on %ch / %eax, which are set before this excerpt (presumably derived from rsi's address -- confirm against the entry code) */
> + and $15, %ch /* ZF set when the low 4 bits of ch are clear */
> + jz L(continue_32_00)
> + cmp $16, %eax
> + jb L(continue_0_32) /* eax < 16 (unsigned) */
> + cmp $32, %eax
> + jb L(continue_16_32) /* 16 <= eax < 32 */
> + cmp $48, %eax
> + jb L(continue_32_32) /* 32 <= eax < 48; otherwise fall through */
> +
> + .p2align 4
> +L(continue_32_48): /* Loop, 64 bytes per pass: double_words 0..7 are compared one at a time, double_words 8..15 with two 16-byte SSE2 compares */
> + mov (%rsi), %ecx
> + cmp %ecx, (%rdi) /* flags reflect (%rdi) - ecx; L(nequal) consumes them */
> + jne L(nequal)
> + test %ecx, %ecx /* equal and zero: both strings end at this double_word */
> + jz L(equal)
> +
> + mov 4(%rsi), %ecx
> + cmp %ecx, 4(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 8(%rsi), %ecx
> + cmp %ecx, 8(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 12(%rsi), %ecx
> + cmp %ecx, 12(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 16(%rsi), %ecx
> + cmp %ecx, 16(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 20(%rsi), %ecx
> + cmp %ecx, 20(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 24(%rsi), %ecx
> + cmp %ecx, 24(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 28(%rsi), %ecx
> + cmp %ecx, 28(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + movdqu 32(%rdi), %xmm1
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + movdqu 48(%rdi), %xmm1
> + movdqu 48(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 12..15 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi /* all 16 double_words matched and none was null: advance both pointers */
> + add $64, %rdi
> + jmp L(continue_32_48)
> +
> + .p2align 4
> +L(continue_16): /* Dispatch on %ch / %eax, which are set before this excerpt (presumably derived from rsi's address -- confirm against the entry code) */
> + and $15, %ch /* ZF set when the low 4 bits of ch are clear */
> + jz L(continue_16_00)
> + cmp $16, %eax
> + jb L(continue_0_16) /* eax < 16 (unsigned) */
> + cmp $32, %eax
> + jb L(continue_16_16) /* 16 <= eax < 32 */
> + cmp $48, %eax
> + jb L(continue_16_32) /* 32 <= eax < 48; otherwise fall through */
> +
> + .p2align 4
> +L(continue_16_48): /* Loop, 64 bytes per pass: double_words 0..3 and 8..11 are compared one at a time, 4..7 and 12..15 with 16-byte SSE2 compares */
> + mov (%rsi), %ecx
> + cmp %ecx, (%rdi) /* flags reflect (%rdi) - ecx; L(nequal) consumes them */
> + jne L(nequal)
> + test %ecx, %ecx /* equal and zero: both strings end at this double_word */
> + jz L(equal)
> +
> + mov 4(%rsi), %ecx
> + cmp %ecx, 4(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 8(%rsi), %ecx
> + cmp %ecx, 8(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 12(%rsi), %ecx
> + cmp %ecx, 12(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + movdqu 16(%rdi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + mov 32(%rsi), %ecx
> + cmp %ecx, 32(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 36(%rsi), %ecx
> + cmp %ecx, 36(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 40(%rsi), %ecx
> + cmp %ecx, 40(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 44(%rsi), %ecx
> + cmp %ecx, 44(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + movdqu 48(%rdi), %xmm1
> + movdqu 48(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 12..15 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi /* all 16 double_words matched and none was null: advance both pointers */
> + add $64, %rdi
> + jmp L(continue_16_48)
> +
> + .p2align 4
> +L(continue_00_00): /* Loop, 64 bytes per pass, all SSE2; movdqa on rdi and the memory operands on rsi require both pointers to be 16-byte aligned */
> + movdqa (%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqa 16(%rdi), %xmm3
> + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rsi), %xmm3 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> + pmovmskb %xmm3, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + movdqa 32(%rdi), %xmm5
> + pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
> + pcmpeqd 32(%rsi), %xmm5 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm5 /* packed sub of comparison results*/
> + pmovmskb %xmm5, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + movdqa 48(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 48(%rsi), %xmm1 /* compare double_words 12..15 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi /* all 16 double_words matched and none was null: advance both pointers */
> + add $64, %rdi
> + jmp L(continue_00_00)
> +
> + .p2align 4
> +L(continue_00_32): /* One 16-byte SSE2 step, then join L(continue_00_48); (%rdi) is an SSE memory operand, so rdi must be 16-byte aligned */
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + add $16, %rsi /* 4 double_words consumed on both sides */
> + add $16, %rdi
> + jmp L(continue_00_48)
> +
> + .p2align 4
> +L(continue_00_16): /* Two 16-byte SSE2 steps, then join L(continue_00_48); rdi is used as an SSE memory operand, so it must be 16-byte aligned */
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rdi), %xmm2 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + add $32, %rsi /* 8 double_words consumed on both sides */
> + add $32, %rdi
> + jmp L(continue_00_48)
> +
> + .p2align 4
> +L(continue_00_0): /* Three 16-byte SSE2 steps, then join L(continue_00_48); rdi is used as an SSE memory operand, so it must be 16-byte aligned */
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rdi), %xmm2 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> + pcmpeqd 32(%rdi), %xmm2 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + add $48, %rsi /* 12 double_words consumed on both sides */
> + add $48, %rdi
> + jmp L(continue_00_48)
> +
> + .p2align 4
> +L(continue_48_00): /* Loop, 64 bytes per pass; rsi is used as an SSE memory operand throughout, so it must be 16-byte aligned */
> + pcmpeqd (%rsi), %xmm0 /* null check on the 4 double_words at rsi (relies on xmm0 being all-zero here) */
> + mov (%rdi), %eax /* eax = first double_word at rdi, also used by L(less4_double_words1) */
> + pmovmskb %xmm0, %ecx
> + test %ecx, %ecx
> + jnz L(less4_double_words1) /* a null is present at rsi: finish with scalar compares */
> +
> + cmp (%rsi), %eax /* no null at rsi in this group, so an equality test per double_word is sufficient */
> + jne L(nequal)
> +
> + mov 4(%rdi), %eax
> + cmp 4(%rsi), %eax
> + jne L(nequal)
> +
> + mov 8(%rdi), %eax
> + cmp 8(%rsi), %eax
> + jne L(nequal)
> +
> + mov 12(%rdi), %eax
> + cmp 12(%rsi), %eax
> + jne L(nequal)
> +
> + movdqu 16(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rsi), %xmm1 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 32(%rsi), %xmm1 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + movdqu 48(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 48(%rsi), %xmm1 /* compare double_words 12..15 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_48)
> +
> + add $64, %rsi /* all 16 double_words matched and none was null: advance both pointers */
> + add $64, %rdi
> + jmp L(continue_48_00)
> +
> + .p2align 4
> +L(continue_32_00): /* One 16-byte SSE2 step, then join L(continue_48_00); (%rsi) is an SSE memory operand, so rsi must be 16-byte aligned */
> + movdqu (%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + add $16, %rsi /* 4 double_words consumed on both sides */
> + add $16, %rdi
> + jmp L(continue_48_00)
> +
> + .p2align 4
> +L(continue_16_00): /* Two 16-byte SSE2 steps, then join L(continue_48_00); rsi is used as an SSE memory operand, so it must be 16-byte aligned */
> + movdqu (%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rsi), %xmm1 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + add $32, %rsi /* 8 double_words consumed on both sides */
> + add $32, %rdi
> + jmp L(continue_48_00)
> +
> + .p2align 4
> +L(continue_0_00): /* Three 16-byte SSE2 steps, then join L(continue_48_00); rsi is used as an SSE memory operand, so it must be 16-byte aligned */
> + movdqu (%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 16(%rsi), %xmm1 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rdi), %xmm1
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd 32(%rsi), %xmm1 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + add $48, %rsi /* 12 double_words consumed on both sides */
> + add $48, %rdi
> + jmp L(continue_48_00)
> +
> + .p2align 4
> +L(continue_32_32): /* One unaligned 16-byte SSE2 step on both strings, then join L(continue_48_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + add $16, %rsi /* 4 double_words consumed on both sides */
> + add $16, %rdi
> + jmp L(continue_48_48)
> +
> + .p2align 4
> +L(continue_16_16): /* Two unaligned 16-byte SSE2 steps on both strings, then join L(continue_48_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rdi), %xmm3
> + movdqu 16(%rsi), %xmm4
> + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm4, %xmm3 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> + pmovmskb %xmm3, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + add $32, %rsi /* 8 double_words consumed on both sides */
> + add $32, %rdi
> + jmp L(continue_48_48)
> +
> + .p2align 4
> +L(continue_0_0): /* Three unaligned 16-byte SSE2 steps on both strings, then join L(continue_48_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rdi), %xmm3
> + movdqu 16(%rsi), %xmm4
> + pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm4, %xmm3 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> + pmovmskb %xmm3, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + movdqu 32(%rdi), %xmm1
> + movdqu 32(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 8..11 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_32)
> +
> + add $48, %rsi /* 12 double_words consumed on both sides */
> + add $48, %rdi
> + jmp L(continue_48_48)
> +
> + .p2align 4
> +L(continue_0_16): /* Two unaligned 16-byte SSE2 steps on both strings, then join L(continue_32_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + movdqu 16(%rdi), %xmm1
> + movdqu 16(%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> + pcmpeqd %xmm2, %xmm1 /* compare double_words 4..7 for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words_16)
> +
> + add $32, %rsi /* 8 double_words consumed on both sides */
> + add $32, %rdi
> + jmp L(continue_32_48)
> +
> + .p2align 4
> +L(continue_0_32): /* One unaligned 16-byte SSE2 step on both strings, then join L(continue_16_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + add $16, %rsi /* 4 double_words consumed on both sides */
> + add $16, %rdi
> + jmp L(continue_16_48)
> +
> + .p2align 4
> +L(continue_16_32): /* One unaligned 16-byte SSE2 step on both strings, then join L(continue_32_48) */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> + pcmpeqd %xmm1, %xmm0 /* Any null double_word? (relies on xmm0 being all-zero here -- presumably cleared before this excerpt) */
> + pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx /* mask bits stay set only for double_words that are equal and non-null */
> + sub $0xffff, %edx /* edx == 0 iff mask was 0xffff: all 4 double_words equal and non-null */
> + jnz L(less4_double_words)
> +
> + add $16, %rsi /* 4 double_words consumed on both sides */
> + add $16, %rdi
> + jmp L(continue_32_48)
> +
> + .p2align 4
> +L(less4_double_words1): /* Scalar tail: entered with eax = (%rdi) once a null double_word was detected in the current 16 bytes of one string */
> + cmp (%rsi), %eax /* flags reflect eax - (%rsi); L(nequal) consumes them */
> + jne L(nequal)
> + test %eax, %eax /* equal and zero: both strings end here */
> + jz L(equal)
> +
> + mov 4(%rsi), %ecx
> + cmp %ecx, 4(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 8(%rsi), %ecx
> + cmp %ecx, 8(%rdi)
> + jne L(nequal)
> + test %ecx, %ecx
> + jz L(equal)
> +
> + mov 12(%rsi), %ecx
> + cmp %ecx, 12(%rdi)
> + jne L(nequal)
> + xor %eax, %eax /* no zero test: double_words 0..2 were equal and non-zero, so the detected null is this one; return 0 */
> + ret
> +
> + .p2align 4
> +L(less4_double_words): /* In: edx = (pmovmskb mask) - 0xffff, non-zero, for double_words 0..3. Finds the first one that differs or is null and returns the comparison result */
> + xor %eax, %eax /* NOTE(review): eax is overwritten by a mov on every path below before any ret */
> + test %dl, %dl /* dl == 0 only when the low mask byte was 0xff, i.e. double_words 0 and 1 both passed */
> + jz L(next_two_double_words)
> + and $15, %dl /* low nibble 0 only when mask bits 0..3 were set, i.e. double_word 0 passed */
> + jz L(second_double_word)
> + mov (%rdi), %eax
> + cmp (%rsi), %eax
> + jne L(nequal)
> + ret /* equal at the flagged position means it is the null terminator, so eax is 0 here */
> +
> + .p2align 4
> +L(second_double_word):
> + mov 4(%rdi), %eax
> + cmp 4(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(next_two_double_words):
> + and $15, %dh /* low nibble 0 only when mask bits 8..11 were set, i.e. double_word 2 passed */
> + jz L(fourth_double_word)
> + mov 8(%rdi), %eax
> + cmp 8(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(fourth_double_word):
> + mov 12(%rdi), %eax
> + cmp 12(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(less4_double_words_16): /* In: edx = (pmovmskb mask) - 0xffff, non-zero, for the 4 double_words at byte offset 16. Finds the first one that differs or is null */
> + xor %eax, %eax /* NOTE(review): eax is overwritten by a mov on every path below before any ret */
> + test %dl, %dl /* dl == 0 only when the low mask byte was 0xff, i.e. the double_words at 16 and 20 both passed */
> + jz L(next_two_double_words_16)
> + and $15, %dl /* low nibble 0 only when mask bits 0..3 were set, i.e. the double_word at 16 passed */
> + jz L(second_double_word_16)
> + mov 16(%rdi), %eax
> + cmp 16(%rsi), %eax
> + jne L(nequal)
> + ret /* equal at the flagged position means it is the null terminator, so eax is 0 here */
> +
> + .p2align 4
> +L(second_double_word_16):
> + mov 20(%rdi), %eax
> + cmp 20(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(next_two_double_words_16):
> + and $15, %dh /* low nibble 0 only when mask bits 8..11 were set, i.e. the double_word at 24 passed */
> + jz L(fourth_double_word_16)
> + mov 24(%rdi), %eax
> + cmp 24(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(fourth_double_word_16):
> + mov 28(%rdi), %eax
> + cmp 28(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(less4_double_words_32): /* In: edx = (pmovmskb mask) - 0xffff, non-zero, for the 4 double_words at byte offset 32. Finds the first one that differs or is null */
> + xor %eax, %eax /* NOTE(review): eax is overwritten by a mov on every path below before any ret */
> + test %dl, %dl /* dl == 0 only when the low mask byte was 0xff, i.e. the double_words at 32 and 36 both passed */
> + jz L(next_two_double_words_32)
> + and $15, %dl /* low nibble 0 only when mask bits 0..3 were set, i.e. the double_word at 32 passed */
> + jz L(second_double_word_32)
> + mov 32(%rdi), %eax
> + cmp 32(%rsi), %eax
> + jne L(nequal)
> + ret /* equal at the flagged position means it is the null terminator, so eax is 0 here */
> +
> + .p2align 4
> +L(second_double_word_32):
> + mov 36(%rdi), %eax
> + cmp 36(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(next_two_double_words_32):
> + and $15, %dh /* low nibble 0 only when mask bits 8..11 were set, i.e. the double_word at 40 passed */
> + jz L(fourth_double_word_32)
> + mov 40(%rdi), %eax
> + cmp 40(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(fourth_double_word_32):
> + mov 44(%rdi), %eax
> + cmp 44(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(less4_double_words_48): /* In: edx = (pmovmskb mask) - 0xffff, non-zero, for the 4 double_words at byte offset 48. Finds the first one that differs or is null */
> + xor %eax, %eax /* NOTE(review): eax is overwritten by a mov on every path below before any ret */
> + test %dl, %dl /* dl == 0 only when the low mask byte was 0xff, i.e. the double_words at 48 and 52 both passed */
> + jz L(next_two_double_words_48)
> + and $15, %dl /* low nibble 0 only when mask bits 0..3 were set, i.e. the double_word at 48 passed */
> + jz L(second_double_word_48)
> + mov 48(%rdi), %eax
> + cmp 48(%rsi), %eax
> + jne L(nequal)
> + ret /* equal at the flagged position means it is the null terminator, so eax is 0 here */
> +
> + .p2align 4
> +L(second_double_word_48):
> + mov 52(%rdi), %eax
> + cmp 52(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(next_two_double_words_48):
> + and $15, %dh /* low nibble 0 only when mask bits 8..11 were set, i.e. the double_word at 56 passed */
> + jz L(fourth_double_word_48)
> + mov 56(%rdi), %eax
> + cmp 56(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(fourth_double_word_48):
> + mov 60(%rdi), %eax
> + cmp 60(%rsi), %eax
> + jne L(nequal)
> + ret /* eax holds the (equal, hence null) double_word: returns 0 */
> +
> + .p2align 4
> +L(nequal): /* Reached via jne straight after a cmp; in the code visible here that cmp always computes (rdi element) - (rsi element) */
> + mov $1, %eax /* mov does not modify flags, so the cmp result is still available */
> + jg L(nequal_bigger) /* signed greater: return 1 */
> + neg %eax /* otherwise return -1 */
> +
> +L(nequal_bigger):
> + ret
> +
> + .p2align 4
> +L(equal): /* Both strings ended at the same position with no difference found */
> + xor %rax, %rax /* return 0. NOTE(review): xor %eax, %eax also zeroes rax with a shorter encoding; kept as-is because this patch is a pure code move */
> + ret
> +
> +END (STRCMP)
> diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S
> index 5cb42f47b9..e04cdbf5fe 100644
> --- a/sysdeps/x86_64/wcscmp.S
> +++ b/sysdeps/x86_64/wcscmp.S
> @@ -16,936 +16,8 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> +/* Symbol = __wcscmp. */
>
> -/* Note: wcscmp uses signed comparison, not unsighed as in strcmp function. */
> -
> - .text
> -ENTRY (__wcscmp)
> -/*
> - * This implementation uses SSE to compare up to 16 bytes at a time.
> -*/
> - mov %esi, %eax
> - mov %edi, %edx
> - pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> - mov %al, %ch
> - mov %dl, %cl
> - and $63, %eax /* rsi alignment in cache line */
> - and $63, %edx /* rdi alignment in cache line */
> - and $15, %cl
> - jz L(continue_00)
> - cmp $16, %edx
> - jb L(continue_0)
> - cmp $32, %edx
> - jb L(continue_16)
> - cmp $48, %edx
> - jb L(continue_32)
> -
> -L(continue_48):
> - and $15, %ch
> - jz L(continue_48_00)
> - cmp $16, %eax
> - jb L(continue_0_48)
> - cmp $32, %eax
> - jb L(continue_16_48)
> - cmp $48, %eax
> - jb L(continue_32_48)
> -
> - .p2align 4
> -L(continue_48_48):
> - mov (%rsi), %ecx
> - cmp %ecx, (%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 4(%rsi), %ecx
> - cmp %ecx, 4(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 8(%rsi), %ecx
> - cmp %ecx, 8(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 12(%rsi), %ecx
> - cmp %ecx, 12(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - movdqu 16(%rdi), %xmm1
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rdi), %xmm1
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - movdqu 48(%rdi), %xmm1
> - movdqu 48(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_48_48)
> -
> -L(continue_0):
> - and $15, %ch
> - jz L(continue_0_00)
> - cmp $16, %eax
> - jb L(continue_0_0)
> - cmp $32, %eax
> - jb L(continue_0_16)
> - cmp $48, %eax
> - jb L(continue_0_32)
> -
> - .p2align 4
> -L(continue_0_48):
> - mov (%rsi), %ecx
> - cmp %ecx, (%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 4(%rsi), %ecx
> - cmp %ecx, 4(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 8(%rsi), %ecx
> - cmp %ecx, 8(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 12(%rsi), %ecx
> - cmp %ecx, 12(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - movdqu 16(%rdi), %xmm1
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rdi), %xmm1
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - mov 48(%rsi), %ecx
> - cmp %ecx, 48(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 52(%rsi), %ecx
> - cmp %ecx, 52(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 56(%rsi), %ecx
> - cmp %ecx, 56(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 60(%rsi), %ecx
> - cmp %ecx, 60(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_0_48)
> -
> - .p2align 4
> -L(continue_00):
> - and $15, %ch
> - jz L(continue_00_00)
> - cmp $16, %eax
> - jb L(continue_00_0)
> - cmp $32, %eax
> - jb L(continue_00_16)
> - cmp $48, %eax
> - jb L(continue_00_32)
> -
> - .p2align 4
> -L(continue_00_48):
> - pcmpeqd (%rdi), %xmm0
> - mov (%rdi), %eax
> - pmovmskb %xmm0, %ecx
> - test %ecx, %ecx
> - jnz L(less4_double_words1)
> -
> - cmp (%rsi), %eax
> - jne L(nequal)
> -
> - mov 4(%rdi), %eax
> - cmp 4(%rsi), %eax
> - jne L(nequal)
> -
> - mov 8(%rdi), %eax
> - cmp 8(%rsi), %eax
> - jne L(nequal)
> -
> - mov 12(%rdi), %eax
> - cmp 12(%rsi), %eax
> - jne L(nequal)
> -
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - movdqu 48(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 48(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_00_48)
> -
> - .p2align 4
> -L(continue_32):
> - and $15, %ch
> - jz L(continue_32_00)
> - cmp $16, %eax
> - jb L(continue_0_32)
> - cmp $32, %eax
> - jb L(continue_16_32)
> - cmp $48, %eax
> - jb L(continue_32_32)
> -
> - .p2align 4
> -L(continue_32_48):
> - mov (%rsi), %ecx
> - cmp %ecx, (%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 4(%rsi), %ecx
> - cmp %ecx, 4(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 8(%rsi), %ecx
> - cmp %ecx, 8(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 12(%rsi), %ecx
> - cmp %ecx, 12(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 16(%rsi), %ecx
> - cmp %ecx, 16(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 20(%rsi), %ecx
> - cmp %ecx, 20(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 24(%rsi), %ecx
> - cmp %ecx, 24(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 28(%rsi), %ecx
> - cmp %ecx, 28(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - movdqu 32(%rdi), %xmm1
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - movdqu 48(%rdi), %xmm1
> - movdqu 48(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_32_48)
> -
> - .p2align 4
> -L(continue_16):
> - and $15, %ch
> - jz L(continue_16_00)
> - cmp $16, %eax
> - jb L(continue_0_16)
> - cmp $32, %eax
> - jb L(continue_16_16)
> - cmp $48, %eax
> - jb L(continue_16_32)
> -
> - .p2align 4
> -L(continue_16_48):
> - mov (%rsi), %ecx
> - cmp %ecx, (%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 4(%rsi), %ecx
> - cmp %ecx, 4(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 8(%rsi), %ecx
> - cmp %ecx, 8(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 12(%rsi), %ecx
> - cmp %ecx, 12(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - movdqu 16(%rdi), %xmm1
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - mov 32(%rsi), %ecx
> - cmp %ecx, 32(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 36(%rsi), %ecx
> - cmp %ecx, 36(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 40(%rsi), %ecx
> - cmp %ecx, 40(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 44(%rsi), %ecx
> - cmp %ecx, 44(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - movdqu 48(%rdi), %xmm1
> - movdqu 48(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_16_48)
> -
> - .p2align 4
> -L(continue_00_00):
> - movdqa (%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqa 16(%rdi), %xmm3
> - pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rsi), %xmm3 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> - pmovmskb %xmm3, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqa 32(%rdi), %xmm5
> - pcmpeqd %xmm5, %xmm0 /* Any null double_word? */
> - pcmpeqd 32(%rsi), %xmm5 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm5 /* packed sub of comparison results*/
> - pmovmskb %xmm5, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - movdqa 48(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_00_00)
> -
> - .p2align 4
> -L(continue_00_32):
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - add $16, %rsi
> - add $16, %rdi
> - jmp L(continue_00_48)
> -
> - .p2align 4
> -L(continue_00_16):
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - add $32, %rsi
> - add $32, %rdi
> - jmp L(continue_00_48)
> -
> - .p2align 4
> -L(continue_00_0):
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm2, %xmm0 /* Any null double_word? */
> - pcmpeqd 32(%rdi), %xmm2 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - add $48, %rsi
> - add $48, %rdi
> - jmp L(continue_00_48)
> -
> - .p2align 4
> -L(continue_48_00):
> - pcmpeqd (%rsi), %xmm0
> - mov (%rdi), %eax
> - pmovmskb %xmm0, %ecx
> - test %ecx, %ecx
> - jnz L(less4_double_words1)
> -
> - cmp (%rsi), %eax
> - jne L(nequal)
> -
> - mov 4(%rdi), %eax
> - cmp 4(%rsi), %eax
> - jne L(nequal)
> -
> - mov 8(%rdi), %eax
> - cmp 8(%rsi), %eax
> - jne L(nequal)
> -
> - mov 12(%rdi), %eax
> - cmp 12(%rsi), %eax
> - jne L(nequal)
> -
> - movdqu 16(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - movdqu 48(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 48(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_48)
> -
> - add $64, %rsi
> - add $64, %rdi
> - jmp L(continue_48_00)
> -
> - .p2align 4
> -L(continue_32_00):
> - movdqu (%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - add $16, %rsi
> - add $16, %rdi
> - jmp L(continue_48_00)
> -
> - .p2align 4
> -L(continue_16_00):
> - movdqu (%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - add $32, %rsi
> - add $32, %rdi
> - jmp L(continue_48_00)
> -
> - .p2align 4
> -L(continue_0_00):
> - movdqu (%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd (%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 16(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rdi), %xmm1
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd 32(%rsi), %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - add $48, %rsi
> - add $48, %rdi
> - jmp L(continue_48_00)
> -
> - .p2align 4
> -L(continue_32_32):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - add $16, %rsi
> - add $16, %rdi
> - jmp L(continue_48_48)
> -
> - .p2align 4
> -L(continue_16_16):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rdi), %xmm3
> - movdqu 16(%rsi), %xmm4
> - pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> - pmovmskb %xmm3, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - add $32, %rsi
> - add $32, %rdi
> - jmp L(continue_48_48)
> -
> - .p2align 4
> -L(continue_0_0):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rdi), %xmm3
> - movdqu 16(%rsi), %xmm4
> - pcmpeqd %xmm3, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm4, %xmm3 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm3 /* packed sub of comparison results*/
> - pmovmskb %xmm3, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - movdqu 32(%rdi), %xmm1
> - movdqu 32(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_32)
> -
> - add $48, %rsi
> - add $48, %rdi
> - jmp L(continue_48_48)
> -
> - .p2align 4
> -L(continue_0_16):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - movdqu 16(%rdi), %xmm1
> - movdqu 16(%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words_16)
> -
> - add $32, %rsi
> - add $32, %rdi
> - jmp L(continue_32_48)
> -
> - .p2align 4
> -L(continue_0_32):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - add $16, %rsi
> - add $16, %rdi
> - jmp L(continue_16_48)
> -
> - .p2align 4
> -L(continue_16_32):
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> - pcmpeqd %xmm1, %xmm0 /* Any null double_word? */
> - pcmpeqd %xmm2, %xmm1 /* compare first 4 double_words for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 4 double_words are same, edx == 0xffff */
> - jnz L(less4_double_words)
> -
> - add $16, %rsi
> - add $16, %rdi
> - jmp L(continue_32_48)
> -
> - .p2align 4
> -L(less4_double_words1):
> - cmp (%rsi), %eax
> - jne L(nequal)
> - test %eax, %eax
> - jz L(equal)
> -
> - mov 4(%rsi), %ecx
> - cmp %ecx, 4(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 8(%rsi), %ecx
> - cmp %ecx, 8(%rdi)
> - jne L(nequal)
> - test %ecx, %ecx
> - jz L(equal)
> -
> - mov 12(%rsi), %ecx
> - cmp %ecx, 12(%rdi)
> - jne L(nequal)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(less4_double_words):
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words)
> - and $15, %dl
> - jz L(second_double_word)
> - mov (%rdi), %eax
> - cmp (%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(second_double_word):
> - mov 4(%rdi), %eax
> - cmp 4(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words):
> - and $15, %dh
> - jz L(fourth_double_word)
> - mov 8(%rdi), %eax
> - cmp 8(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word):
> - mov 12(%rdi), %eax
> - cmp 12(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(less4_double_words_16):
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words_16)
> - and $15, %dl
> - jz L(second_double_word_16)
> - mov 16(%rdi), %eax
> - cmp 16(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(second_double_word_16):
> - mov 20(%rdi), %eax
> - cmp 20(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words_16):
> - and $15, %dh
> - jz L(fourth_double_word_16)
> - mov 24(%rdi), %eax
> - cmp 24(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word_16):
> - mov 28(%rdi), %eax
> - cmp 28(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(less4_double_words_32):
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words_32)
> - and $15, %dl
> - jz L(second_double_word_32)
> - mov 32(%rdi), %eax
> - cmp 32(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(second_double_word_32):
> - mov 36(%rdi), %eax
> - cmp 36(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words_32):
> - and $15, %dh
> - jz L(fourth_double_word_32)
> - mov 40(%rdi), %eax
> - cmp 40(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word_32):
> - mov 44(%rdi), %eax
> - cmp 44(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(less4_double_words_48):
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words_48)
> - and $15, %dl
> - jz L(second_double_word_48)
> - mov 48(%rdi), %eax
> - cmp 48(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(second_double_word_48):
> - mov 52(%rdi), %eax
> - cmp 52(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words_48):
> - and $15, %dh
> - jz L(fourth_double_word_48)
> - mov 56(%rdi), %eax
> - cmp 56(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word_48):
> - mov 60(%rdi), %eax
> - cmp 60(%rsi), %eax
> - jne L(nequal)
> - ret
> -
> - .p2align 4
> -L(nequal):
> - mov $1, %eax
> - jg L(nequal_bigger)
> - neg %eax
> -
> -L(nequal_bigger):
> - ret
> -
> - .p2align 4
> -L(equal):
> - xor %rax, %rax
> - ret
> -
> -END (__wcscmp)
> -#ifndef __wcscmp
> +#include "multiarch/wcscmp-sse2.S"
> libc_hidden_def (__wcscmp)
> weak_alias (__wcscmp, wcscmp)
> -#endif
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH v1 4/4] x86: Move strcmp SSE42 implementation to multiarch/strcmp-sse4_2.S
2022-07-12 19:28 [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII Noah Goldstein
2022-07-12 19:28 ` [PATCH v1 2/4] x86: Move strcmp SSE2 implementation to multiarch/strcmp-sse2.S Noah Goldstein
2022-07-12 19:28 ` [PATCH v1 3/4] x86: Move wcscmp SSE2 implementation to multiarch/wcscmp-sse2.S Noah Goldstein
@ 2022-07-12 19:28 ` Noah Goldstein
2022-07-13 0:07 ` H.J. Lu
2022-07-12 23:30 ` [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII H.J. Lu
3 siblings, 1 reply; 8+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:28 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
.../x86_64/multiarch/strcasecmp_l-sse4_2.S | 3 +-
sysdeps/x86_64/multiarch/strcmp-sse42.S | 1782 -----------------
sysdeps/x86_64/multiarch/strcmp-sse4_2.S | 1763 +++++++++++++++-
sysdeps/x86_64/multiarch/strncase_l-sse4_2.S | 3 +-
sysdeps/x86_64/multiarch/strncmp-sse4_2.S | 7 +-
5 files changed, 1766 insertions(+), 1792 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcmp-sse42.S
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
index 411ab7d283..ac03b95756 100644
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
@@ -16,6 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP_SSE42 __strcasecmp_l_sse42
#define USE_AS_STRCASECMP_L
-#include "strcmp-sse42.S"
+#include "strcmp-sse4_2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
deleted file mode 100644
index 60313c647a..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ /dev/null
@@ -1,1782 +0,0 @@
-/* strcmp with SSE4.2
- Copyright (C) 2009-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRCMP_SSE42
-# define STRCMP_SSE42 __strcmp_sse42
-#endif
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# include "locale-defines.h"
-#endif
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
-/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
- if the new counter > the old one or is 0. */
-# define UPDATE_STRNCMP_COUNTER \
- /* calculate left number to compare */ \
- lea -16(%rcx, %r11), %r9; \
- cmp %r9, %r11; \
- jb LABEL(strcmp_exitz); \
- test %r9, %r9; \
- je LABEL(strcmp_exitz); \
- mov %r9, %r11
-#else
-# define UPDATE_STRNCMP_COUNTER
-#endif
-
-#define SECTION sse4.2
-#define GLABEL(l) l##_sse42
-
-#define LABEL(l) .L##l
-
-/* We use 0x1a:
- _SIDD_SBYTE_OPS
- | _SIDD_CMP_EQUAL_EACH
- | _SIDD_NEGATIVE_POLARITY
- | _SIDD_LEAST_SIGNIFICANT
- on pcmpistri to find out if two 16byte data elements are the same
- and the offset of the first different byte. There are 4 cases:
-
- 1. Both 16byte data elements are valid and identical.
- 2. Both 16byte data elements have EOS and identical.
- 3. Both 16byte data elements are valid and they differ at offset X.
- 4. At least one 16byte data element has EOS at offset X. Two 16byte
- data elements must differ at or before offset X.
-
- Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
-
- case ECX CFlag ZFlag SFlag
- 1 16 0 0 0
- 2 16 0 1 1
- 3 X 1 0 0
- 4 0 <= X 1 0/1 0/1
-
- We exit from the loop for cases 2, 3 and 4 with jbe which branches
- when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
- case 2. */
-
- /* Put all SSE 4.2 functions together. */
- .section .text.SECTION,"ax",@progbits
- .align 16
- .type STRCMP_SSE42, @function
- .globl STRCMP_SSE42
-#ifdef USE_AS_STRCASECMP_L
-ENTRY (GLABEL(__strcasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RDX_LP
-
- /* Either 1 or 5 bytes (dependeing if CET is enabled). */
- .p2align 4
-END (GLABEL(__strcasecmp))
- /* FALLTHROUGH to strcasecmp_l. */
-#endif
-#ifdef USE_AS_STRNCASECMP_L
-ENTRY (GLABEL(__strncasecmp))
- movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
- mov %fs:(%rax),%RCX_LP
-
- /* Either 1 or 5 bytes (dependeing if CET is enabled). */
- .p2align 4
-END (GLABEL(__strncasecmp))
- /* FALLTHROUGH to strncasecmp_l. */
-#endif
-
-
-#define arg arg
-
-STRCMP_SSE42:
- cfi_startproc
- _CET_ENDBR
- CALL_MCOUNT
-
-/*
- * This implementation uses SSE to compare up to 16 bytes at a time.
- */
-#ifdef USE_AS_STRCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
-# else
- mov (%rdx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strcasecmp_l_nonascii
-#endif
-#ifdef USE_AS_STRNCASECMP_L
- /* We have to fall back on the C implementation for locales
- with encodings not matching ASCII for single bytes. */
-# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
- mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
-# else
- mov (%rcx), %RAX_LP
-# endif
- testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
- jne __strncasecmp_l_nonascii
-#endif
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %RDX_LP, %RDX_LP
- je LABEL(strcmp_exitz)
- cmp $1, %RDX_LP
- je LABEL(Byte0)
- mov %RDX_LP, %R11_LP
-#endif
- mov %esi, %ecx
- mov %edi, %eax
-/* Use 64bit AND here to avoid long NOP padding. */
- and $0x3f, %rcx /* rsi alignment in cache line */
- and $0x3f, %rax /* rdi alignment in cache line */
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- .section .rodata.cst16,"aM",@progbits,16
- .align 16
-LABEL(lcase_min):
- .quad 0x3f3f3f3f3f3f3f3f
- .quad 0x3f3f3f3f3f3f3f3f
-LABEL(lcase_max):
- .quad 0x9999999999999999
- .quad 0x9999999999999999
-LABEL(case_add):
- .quad 0x2020202020202020
- .quad 0x2020202020202020
- .previous
- movdqa LABEL(lcase_min)(%rip), %xmm4
-# define LCASE_MIN_reg %xmm4
- movdqa LABEL(lcase_max)(%rip), %xmm5
-# define LCASE_MAX_reg %xmm5
- movdqa LABEL(case_add)(%rip), %xmm6
-# define CASE_ADD_reg %xmm6
-#endif
- cmp $0x30, %ecx
- ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
- cmp $0x30, %eax
- ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
- movdqu (%rdi), %xmm1
- movdqu (%rsi), %xmm2
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
-# define TOLOWER(reg1, reg2) \
- movdqa LCASE_MIN_reg, %xmm7; \
- movdqa LCASE_MIN_reg, %xmm8; \
- paddb reg1, %xmm7; \
- paddb reg2, %xmm8; \
- pcmpgtb LCASE_MAX_reg, %xmm7; \
- pcmpgtb LCASE_MAX_reg, %xmm8; \
- pandn CASE_ADD_reg, %xmm7; \
- pandn CASE_ADD_reg, %xmm8; \
- paddb %xmm7, reg1; \
- paddb %xmm8, reg2
-
- TOLOWER (%xmm1, %xmm2)
-#else
-# define TOLOWER(reg1, reg2)
-#endif
- pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
- pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %edx
- sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
- jnz LABEL(less16bytes)/* If not, find different value or null char */
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)/* finish comparison */
-#endif
- add $16, %rsi /* prepare to search next 16 bytes */
- add $16, %rdi /* prepare to search next 16 bytes */
-
- /*
- * Determine source and destination string offsets from 16-byte
- * alignment. Use relative offset difference between the two to
- * determine which case below to use.
- */
- .p2align 4
-LABEL(crosscache):
- and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
- and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
- mov $0xffff, %edx /* for equivalent offset */
- xor %r8d, %r8d
- and $0xf, %ecx /* offset of rsi */
- and $0xf, %eax /* offset of rdi */
- pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
- cmp %eax, %ecx
- je LABEL(ashr_0) /* rsi and rdi relative offset same */
- ja LABEL(bigger)
- mov %edx, %r8d /* r8d is offset flag for exit tail */
- xchg %ecx, %eax
- xchg %rsi, %rdi
-LABEL(bigger):
- movdqa (%rdi), %xmm2
- movdqa (%rsi), %xmm1
- lea 15(%rax), %r9
- sub %rcx, %r9
- lea LABEL(unaligned_table)(%rip), %r10
- movslq (%r10, %r9,4), %r9
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
- lea (%r10, %r9), %r10
- _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
-
-/*
- * The following cases will be handled by ashr_0
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(0~15) n(0~15) 15(15+ n-n) ashr_0
- */
- .p2align 4
-LABEL(ashr_0):
-
- movdqa (%rsi), %xmm1
- pcmpeqb %xmm1, %xmm0 /* Any null chars? */
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
-#else
- movdqa (%rdi), %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
-#endif
- psubb %xmm0, %xmm1 /* packed sub of comparison results*/
- pmovmskb %xmm1, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- /*
- * edx must be the same with r9d if in left byte (16-rcx) is equal to
- * the start from (16-rax) and no null char was seen.
- */
- jne LABEL(less32bytes) /* mismatch or null char */
- UPDATE_STRNCMP_COUNTER
- mov $16, %rcx
- mov $16, %r9
-
- /*
- * Now both strings are aligned at 16-byte boundary. Loop over strings
- * checking 32-bytes per iteration.
- */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
- .p2align 4
-LABEL(ashr_0_use):
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- movdqa (%rdi,%rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- lea 16(%rdx), %rdx
- jbe LABEL(ashr_0_exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- jmp LABEL(ashr_0_use)
-
-
- .p2align 4
-LABEL(ashr_0_exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- lea -16(%rdx, %rcx), %rcx
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rax,4), %eax
- movl (%rcx,%rdx,4), %edx
-#endif
- sub %edx, %eax
- ret
-
-
-
-/*
- * The following cases will be handled by ashr_1
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(15) n -15 0(15 +(n-15) - n) ashr_1
- */
- .p2align 4
-LABEL(ashr_1):
- pslldq $15, %xmm2 /* shift first string to align with second */
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
- psubb %xmm0, %xmm2 /* packed sub of comparison results*/
- pmovmskb %xmm2, %r9d
- shr %cl, %edx /* adjust 0xffff for offset */
- shr %cl, %r9d /* adjust for 16-byte offset */
- sub %r9d, %edx
- jnz LABEL(less32bytes) /* mismatch or null char seen */
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads*/
- mov $1, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 1(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_1_use):
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
-LABEL(nibble_ashr_1_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_1_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $1, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_1_use)
-
- .p2align 4
-LABEL(nibble_ashr_1_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $1, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $14, %ecx
- ja LABEL(nibble_ashr_1_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_2
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
- */
- .p2align 4
-LABEL(ashr_2):
- pslldq $14, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $2, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 2(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_2_use):
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
-LABEL(nibble_ashr_2_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_2_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $2, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_2_use)
-
- .p2align 4
-LABEL(nibble_ashr_2_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $2, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $13, %ecx
- ja LABEL(nibble_ashr_2_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_3
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
- */
- .p2align 4
-LABEL(ashr_3):
- pslldq $13, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $3, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 3(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
-LABEL(loop_ashr_3_use):
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
-LABEL(nibble_ashr_3_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_3_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $3, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_3_use)
-
- .p2align 4
-LABEL(nibble_ashr_3_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $3, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $12, %ecx
- ja LABEL(nibble_ashr_3_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_4
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
- */
- .p2align 4
-LABEL(ashr_4):
- pslldq $12, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $4, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 4(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_4_use):
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
-LABEL(nibble_ashr_4_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_4_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $4, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_4_use)
-
- .p2align 4
-LABEL(nibble_ashr_4_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $4, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $11, %ecx
- ja LABEL(nibble_ashr_4_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_5
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
- */
- .p2align 4
-LABEL(ashr_5):
- pslldq $11, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $5, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 5(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_5_use):
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
-LABEL(nibble_ashr_5_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $5, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_5_use)
-
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $5, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_5_use)
-
- .p2align 4
-LABEL(nibble_ashr_5_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $5, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $10, %ecx
- ja LABEL(nibble_ashr_5_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_6
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
- */
- .p2align 4
-LABEL(ashr_6):
- pslldq $10, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $6, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 6(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_6_use):
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
-LABEL(nibble_ashr_6_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_6_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $6, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_6_use)
-
- .p2align 4
-LABEL(nibble_ashr_6_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $6, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $9, %ecx
- ja LABEL(nibble_ashr_6_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_7
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
- */
- .p2align 4
-LABEL(ashr_7):
- pslldq $9, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $7, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 7(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_7_use):
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
-LABEL(nibble_ashr_7_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_7_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $7, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_7_use)
-
- .p2align 4
-LABEL(nibble_ashr_7_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $7, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $8, %ecx
- ja LABEL(nibble_ashr_7_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_8
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
- */
- .p2align 4
-LABEL(ashr_8):
- pslldq $8, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $8, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 8(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_8_use):
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
-LABEL(nibble_ashr_8_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_8_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $8, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_8_use)
-
- .p2align 4
-LABEL(nibble_ashr_8_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $8, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $7, %ecx
- ja LABEL(nibble_ashr_8_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_9
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
- */
- .p2align 4
-LABEL(ashr_9):
- pslldq $7, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $9, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 9(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_9_use):
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
-LABEL(nibble_ashr_9_restart_use):
- movdqa (%rdi, %rdx), %xmm0
-
- palignr $9, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_9_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $9, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_9_use)
-
- .p2align 4
-LABEL(nibble_ashr_9_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $9, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $6, %ecx
- ja LABEL(nibble_ashr_9_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_10
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
- */
- .p2align 4
-LABEL(ashr_10):
- pslldq $6, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $10, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 10(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_10_use):
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
-LABEL(nibble_ashr_10_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_10_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $10, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_10_use)
-
- .p2align 4
-LABEL(nibble_ashr_10_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $10, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $5, %ecx
- ja LABEL(nibble_ashr_10_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_11
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
- */
- .p2align 4
-LABEL(ashr_11):
- pslldq $5, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $11, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 11(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_11_use):
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
-LABEL(nibble_ashr_11_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_11_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $11, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_11_use)
-
- .p2align 4
-LABEL(nibble_ashr_11_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $11, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $4, %ecx
- ja LABEL(nibble_ashr_11_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_12
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
- */
- .p2align 4
-LABEL(ashr_12):
- pslldq $4, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $12, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 12(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_12_use):
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
-LABEL(nibble_ashr_12_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_12_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $12, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_12_use)
-
- .p2align 4
-LABEL(nibble_ashr_12_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $12, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $3, %ecx
- ja LABEL(nibble_ashr_12_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_13
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
- */
- .p2align 4
-LABEL(ashr_13):
- pslldq $3, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $13, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 13(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_13_use):
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
-LABEL(nibble_ashr_13_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_13_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $13, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_13_use)
-
- .p2align 4
-LABEL(nibble_ashr_13_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $13, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $2, %ecx
- ja LABEL(nibble_ashr_13_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_14
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
- */
- .p2align 4
-LABEL(ashr_14):
- pslldq $2, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $14, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 14(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_14_use):
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
-LABEL(nibble_ashr_14_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_14_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $14, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_14_use)
-
- .p2align 4
-LABEL(nibble_ashr_14_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $14, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $1, %ecx
- ja LABEL(nibble_ashr_14_restart_use)
-
- jmp LABEL(nibble_ashr_exit_use)
-
-/*
- * The following cases will be handled by ashr_15
- * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
- * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
- */
- .p2align 4
-LABEL(ashr_15):
- pslldq $1, %xmm2
- TOLOWER (%xmm1, %xmm2)
- pcmpeqb %xmm1, %xmm2
- psubb %xmm0, %xmm2
- pmovmskb %xmm2, %r9d
- shr %cl, %edx
- shr %cl, %r9d
- sub %r9d, %edx
- jnz LABEL(less32bytes)
-
- movdqa (%rdi), %xmm3
-
- UPDATE_STRNCMP_COUNTER
-
- mov $16, %rcx /* index for loads */
- mov $15, %r9d /* byte position left over from less32bytes case */
- /*
- * Setup %r10 value allows us to detect crossing a page boundary.
- * When %r10 goes positive we have crossed a page boundary and
- * need to do a nibble.
- */
- lea 15(%rdi), %r10
- and $0xfff, %r10 /* offset into 4K page */
-
- sub $0x1000, %r10 /* subtract 4K pagesize */
-
- mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
-
- .p2align 4
-LABEL(loop_ashr_15_use):
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
-LABEL(nibble_ashr_15_restart_use):
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
-
- add $16, %rdx
- add $16, %r10
- jg LABEL(nibble_ashr_15_use)
-
- movdqa (%rdi, %rdx), %xmm0
- palignr $15, -16(%rdi, %rdx), %xmm0
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a, (%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- jbe LABEL(exit_use)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub $16, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add $16, %rdx
- jmp LABEL(loop_ashr_15_use)
-
- .p2align 4
-LABEL(nibble_ashr_15_use):
- sub $0x1000, %r10
- movdqa -16(%rdi, %rdx), %xmm0
- psrldq $15, %xmm0
- pcmpistri $0x3a,%xmm0, %xmm0
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- cmp %r11, %rcx
- jae LABEL(nibble_ashr_exit_use)
-#endif
- cmp $0, %ecx
- ja LABEL(nibble_ashr_15_restart_use)
-
-LABEL(nibble_ashr_exit_use):
-#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
- pcmpistri $0x1a,(%rsi,%rdx), %xmm0
-#else
- movdqa (%rsi,%rdx), %xmm1
- TOLOWER (%xmm0, %xmm1)
- pcmpistri $0x1a, %xmm1, %xmm0
-#endif
- .p2align 4
-LABEL(exit_use):
- jnc LABEL(strcmp_exitz)
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rcx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- add %rcx, %rdx
- lea -16(%rdi, %r9), %rdi
- movzbl (%rdi, %rdx), %eax
- movzbl (%rsi, %rdx), %edx
- test %r8d, %r8d
- jz LABEL(ret_use)
- xchg %eax, %edx
-LABEL(ret_use):
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
- movl (%rcx,%rdx,4), %edx
- movl (%rcx,%rax,4), %eax
-#endif
-
- sub %edx, %eax
- ret
-
-LABEL(less32bytes):
- lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
- lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
- test %r8d, %r8d
- jz LABEL(ret)
- xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
-
- .p2align 4
-LABEL(ret):
-LABEL(less16bytes):
- bsf %rdx, %rdx /* find and store bit index in %rdx */
-
-#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- sub %rdx, %r11
- jbe LABEL(strcmp_exitz)
-#endif
- movzbl (%rsi, %rdx), %ecx
- movzbl (%rdi, %rdx), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
-
-LABEL(strcmp_exitz):
- xor %eax, %eax
- ret
-
- .p2align 4
- // XXX Same as code above
-LABEL(Byte0):
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
-
-#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
- leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
- movl (%rdx,%rcx,4), %ecx
- movl (%rdx,%rax,4), %eax
-#endif
-
- sub %ecx, %eax
- ret
- cfi_endproc
- .size STRCMP_SSE42, .-STRCMP_SSE42
-
-#undef UCLOW_reg
-#undef UCHIGH_reg
-#undef LCQWORD_reg
-#undef TOLOWER
-
- /* Put all SSE 4.2 functions together. */
- .section .rodata.SECTION,"a",@progbits
- .p2align 3
-LABEL(unaligned_table):
- .int LABEL(ashr_1) - LABEL(unaligned_table)
- .int LABEL(ashr_2) - LABEL(unaligned_table)
- .int LABEL(ashr_3) - LABEL(unaligned_table)
- .int LABEL(ashr_4) - LABEL(unaligned_table)
- .int LABEL(ashr_5) - LABEL(unaligned_table)
- .int LABEL(ashr_6) - LABEL(unaligned_table)
- .int LABEL(ashr_7) - LABEL(unaligned_table)
- .int LABEL(ashr_8) - LABEL(unaligned_table)
- .int LABEL(ashr_9) - LABEL(unaligned_table)
- .int LABEL(ashr_10) - LABEL(unaligned_table)
- .int LABEL(ashr_11) - LABEL(unaligned_table)
- .int LABEL(ashr_12) - LABEL(unaligned_table)
- .int LABEL(ashr_13) - LABEL(unaligned_table)
- .int LABEL(ashr_14) - LABEL(unaligned_table)
- .int LABEL(ashr_15) - LABEL(unaligned_table)
- .int LABEL(ashr_0) - LABEL(unaligned_table)
-
-#undef LABEL
-#undef GLABEL
-#undef SECTION
-#undef movdqa
-#undef movdqu
-#undef pmovmskb
-#undef pcmpistri
-#undef psubb
-#undef pcmpeqb
-#undef psrldq
-#undef pslldq
-#undef palignr
-#undef pxor
-#undef D
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
index 2c916bafa0..963e208ccb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
@@ -17,5 +17,1766 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# include "strcmp-sse42.S"
+# include <sysdep.h>
+
+# define STRCMP_ISA _sse42
+# include "strcmp-naming.h"
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# include "locale-defines.h"
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+ if the new counter > the old one (it wrapped) or is 0. */
+# define UPDATE_STRNCMP_COUNTER \
+ /* Bytes left to compare: %r9 = %r11 - (16 - %rcx). */ \
+ lea -16(%rcx, %r11), %r9; \
+ cmp %r9, %r11; \
+ jb LABEL(strcmp_exitz); \
+ test %r9, %r9; \
+ je LABEL(strcmp_exitz); \
+ mov %r9, %r11
+# else
+# define UPDATE_STRNCMP_COUNTER
+# endif
+
+# define SECTION sse4.2
+
+# define LABEL(l) .L##l
+
+/* We use 0x1a:
+ _SIDD_SBYTE_OPS
+ | _SIDD_CMP_EQUAL_EACH
+ | _SIDD_NEGATIVE_POLARITY
+ | _SIDD_LEAST_SIGNIFICANT
+ on pcmpistri to find out if two 16byte data elements are the same
+ and the offset of the first different byte. There are 4 cases:
+
+ 1. Both 16byte data elements are valid and identical.
+ 2. Both 16byte data elements have EOS and are identical.
+ 3. Both 16byte data elements are valid and they differ at offset X.
+ 4. At least one 16byte data element has EOS at offset X. Two 16byte
+ data elements must differ at or before offset X.
+
+ Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
+
+ case ECX CFlag ZFlag SFlag
+ 1 16 0 0 0
+ 2 16 0 1 1
+ 3 X 1 0 0
+ 4 0 <= X 1 0/1 0/1
+
+ We exit from the loop for cases 2, 3 and 4 with jbe which branches
+ when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
+ case 2. */
+
+ /* Put all SSE 4.2 functions together. */
+ .section .text.SECTION,"ax",@progbits
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RDX_LP /* %rdx = TLS value __libc_tsd_LOCALE */
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strcasecmp_l. */
+# endif
+# ifdef USE_AS_STRNCASECMP_L
+ENTRY (STRCASECMP)
+ movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
+ mov %fs:(%rax),%RCX_LP /* %rcx = TLS value __libc_tsd_LOCALE */
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (STRCASECMP)
+ /* FALLTHROUGH to strncasecmp_l. */
+# endif
+
+
+# define arg arg
+
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+# ifdef USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
+# else
+ mov (%rdx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strcasecmp_l_nonascii
+# endif
+# ifdef USE_AS_STRNCASECMP_L
+ /* We have to fall back on the C implementation for locales
+ with encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
+# else
+ mov (%rcx), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
+ jne __strncasecmp_l_nonascii
+# endif
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ test %RDX_LP, %RDX_LP
+ je LABEL(strcmp_exitz) /* length argument is 0 */
+ cmp $1, %RDX_LP
+ je LABEL(Byte0) /* length argument is 1 */
+ mov %RDX_LP, %R11_LP /* %r11 = byte counter */
+# endif
+ mov %esi, %ecx
+ mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+ and $0x3f, %rcx /* rsi alignment in cache line */
+ and $0x3f, %rax /* rdi alignment in cache line */
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ .section .rodata.cst16,"aM",@progbits,16
+ .align 16
+LABEL(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+LABEL(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+ movdqa LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+ movdqa LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+ movdqa LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
+# endif
+ cmp $0x30, %ecx
+ ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
+ cmp $0x30, %eax
+ ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
+ movdqu (%rdi), %xmm1
+ movdqu (%rsi), %xmm2
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+# define TOLOWER(reg1, reg2) \
+ movdqa LCASE_MIN_reg, %xmm7; \
+ movdqa LCASE_MIN_reg, %xmm8; \
+ paddb reg1, %xmm7; /* bias bytes by 0x3f */ \
+ paddb reg2, %xmm8; \
+ pcmpgtb LCASE_MAX_reg, %xmm7; /* signed compare against 0x99 */ \
+ pcmpgtb LCASE_MAX_reg, %xmm8; \
+ pandn CASE_ADD_reg, %xmm7; /* 0x20 where the compare failed, else 0 */ \
+ pandn CASE_ADD_reg, %xmm8; \
+ paddb %xmm7, reg1; /* add the per-byte 0x20/0 adjustment */ \
+ paddb %xmm8, reg2
+
+ TOLOWER (%xmm1, %xmm2)
+# else
+# define TOLOWER(reg1, reg2)
+# endif
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %edx
+ sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
+ jnz LABEL(less16bytes)/* If not, find different value or null char */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)/* finish comparison */
+# endif
+ add $16, %rsi /* prepare to search next 16 bytes */
+ add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+ * Determine source and destination string offsets from 16-byte
+ * alignment. Use relative offset difference between the two to
+ * determine which case below to use.
+ */
+ .p2align 4
+LABEL(crosscache):
+ and $0xfffffffffffffff0, %rsi /* force %rsi to be 16-byte aligned */
+ and $0xfffffffffffffff0, %rdi /* force %rdi to be 16-byte aligned */
+ mov $0xffff, %edx /* for equivalent offset */
+ xor %r8d, %r8d
+ and $0xf, %ecx /* offset of rsi */
+ and $0xf, %eax /* offset of rdi */
+ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
+ cmp %eax, %ecx
+ je LABEL(ashr_0) /* rsi and rdi relative offset same */
+ ja LABEL(bigger)
+ mov %edx, %r8d /* r8d is offset flag for exit tail */
+ xchg %ecx, %eax /* make %ecx the larger offset */
+ xchg %rsi, %rdi /* swap the string pointers to match */
+LABEL(bigger):
+ movdqa (%rdi), %xmm2
+ movdqa (%rsi), %xmm1
+ lea 15(%rax), %r9
+ sub %rcx, %r9 /* %r9 = 15 + rax - rcx: unaligned_table index */
+ lea LABEL(unaligned_table)(%rip), %r10
+ movslq (%r10, %r9,4), %r9 /* load sign-extended 32-bit table entry */
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+ lea (%r10, %r9), %r10 /* entry is added to the table base */
+ _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
+
+/*
+ * The following cases will be handled by ashr_0
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(0~15) n(0~15) 15(15+ n-n) ashr_0
+ */
+ .p2align 4
+LABEL(ashr_0):
+
+ movdqa (%rsi), %xmm1
+ pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
+# else
+ movdqa (%rdi), %xmm2
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
+# endif
+ psubb %xmm0, %xmm1 /* packed sub of comparison results*/
+ pmovmskb %xmm1, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ /*
+ * edx equals r9d only if the (16 - rcx) bytes from offset rcx onward
+ * compared equal and no null char was seen among them.
+ */
+ jne LABEL(less32bytes) /* mismatch or null char */
+ UPDATE_STRNCMP_COUNTER
+ mov $16, %rcx
+ mov $16, %r9
+
+ /*
+ * Now both strings are aligned at 16-byte boundary. Loop over strings
+ * checking 32-bytes per iteration.
+ */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+ .p2align 4
+LABEL(ashr_0_use):
+ movdqa (%rdi,%rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ lea 16(%rdx), %rdx /* advance offset; lea leaves the flags intact */
+ jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: cases 2, 3 and 4 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ movdqa (%rdi,%rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ lea 16(%rdx), %rdx /* advance offset; lea leaves the flags intact */
+ jbe LABEL(ashr_0_exit_use) /* CFlag or ZFlag set: cases 2, 3 and 4 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ jmp LABEL(ashr_0_use)
+
+
+ .p2align 4
+LABEL(ashr_0_exit_use):
+ jnc LABEL(strcmp_exitz) /* CFlag == 0: case 2 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %rcx, %r11
+ jbe LABEL(strcmp_exitz) /* remaining count <= %rcx */
+# endif
+ lea -16(%rdx, %rcx), %rcx /* %rdx was already advanced by 16 */
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
+ movl (%rcx,%rax,4), %eax /* index the table by byte value */
+ movl (%rcx,%rdx,4), %edx
+# endif
+ sub %edx, %eax
+ ret
+
+
+
+/*
+ * The following cases will be handled by ashr_1
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(15) n -15 0(15 +(n-15) - n) ashr_1
+ */
+ .p2align 4
+LABEL(ashr_1):
+ pslldq $15, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results*/
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads*/
+ mov $1, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 1(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_1_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_1_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_1_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $1, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -15(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_1_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $1, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_1_use)
+
+ .p2align 4
+LABEL(nibble_ashr_1_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $1, %xmm0 /* drop the 1 byte already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $14, %ecx
+ ja LABEL(nibble_ashr_1_restart_use) /* no null char in the 15 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_2
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
+ */
+ .p2align 4
+LABEL(ashr_2):
+ pslldq $14, %xmm2 /* shift %xmm2 left 14 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $2, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 2(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_2_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_2_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_2_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $2, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -14(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_2_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $2, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_2_use)
+
+ .p2align 4
+LABEL(nibble_ashr_2_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $2, %xmm0 /* drop the 2 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $13, %ecx
+ ja LABEL(nibble_ashr_2_restart_use) /* no null char in the 14 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_3
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
+ */
+ .p2align 4
+LABEL(ashr_3):
+ pslldq $13, %xmm2 /* shift %xmm2 left 13 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $3, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 3(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+LABEL(loop_ashr_3_use): /* NOTE(review): the only loop_ashr_N_use label here without a preceding .p2align 4 */
+ add $16, %r10
+ jg LABEL(nibble_ashr_3_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_3_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $3, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -13(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_3_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $3, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_3_use)
+
+ .p2align 4
+LABEL(nibble_ashr_3_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $3, %xmm0 /* drop the 3 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $12, %ecx
+ ja LABEL(nibble_ashr_3_restart_use) /* no null char in the 13 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_4
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
+ */
+ .p2align 4
+LABEL(ashr_4):
+ pslldq $12, %xmm2 /* shift %xmm2 left 12 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $4, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 4(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_4_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_4_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_4_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $4, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -12(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_4_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $4, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_4_use)
+
+ .p2align 4
+LABEL(nibble_ashr_4_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $4, %xmm0 /* drop the 4 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $11, %ecx
+ ja LABEL(nibble_ashr_4_restart_use) /* no null char in the 12 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_5
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
+ */
+ .p2align 4
+LABEL(ashr_5):
+ pslldq $11, %xmm2 /* shift %xmm2 left 11 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $5, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 5(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_5_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_5_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_5_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $5, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -11(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_5_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+
+ palignr $5, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_5_use)
+
+ .p2align 4
+LABEL(nibble_ashr_5_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $5, %xmm0 /* drop the 5 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $10, %ecx
+ ja LABEL(nibble_ashr_5_restart_use) /* no null char in the 11 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_6
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
+ */
+ .p2align 4
+LABEL(ashr_6):
+ pslldq $10, %xmm2 /* shift %xmm2 left 10 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $6, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 6(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_6_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_6_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_6_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $6, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -10(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_6_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $6, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_6_use)
+
+ .p2align 4
+LABEL(nibble_ashr_6_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $6, %xmm0 /* drop the 6 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $9, %ecx
+ ja LABEL(nibble_ashr_6_restart_use) /* no null char in the 10 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_7
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
+ */
+ .p2align 4
+LABEL(ashr_7):
+ pslldq $9, %xmm2 /* shift %xmm2 left 9 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $7, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 7(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_7_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_7_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_7_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $7, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -9(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_7_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $7, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_7_use)
+
+ .p2align 4
+LABEL(nibble_ashr_7_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $7, %xmm0 /* drop the 7 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $8, %ecx
+ ja LABEL(nibble_ashr_7_restart_use) /* no null char in the 9 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_8
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
+ */
+ .p2align 4
+LABEL(ashr_8):
+ pslldq $8, %xmm2 /* shift %xmm2 left 8 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $8, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 8(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_8_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_8_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_8_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $8, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -8(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_8_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $8, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_8_use)
+
+ .p2align 4
+LABEL(nibble_ashr_8_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $8, %xmm0 /* drop the 8 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $7, %ecx
+ ja LABEL(nibble_ashr_8_restart_use) /* no null char in the 8 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_9
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
+ */
+ .p2align 4
+LABEL(ashr_9):
+ pslldq $7, %xmm2 /* shift %xmm2 left 7 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $9, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 9(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_9_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_9_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_9_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+
+ palignr $9, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -7(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_9_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $9, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_9_use)
+
+ .p2align 4
+LABEL(nibble_ashr_9_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $9, %xmm0 /* drop the 9 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $6, %ecx
+ ja LABEL(nibble_ashr_9_restart_use) /* no null char in the 7 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_10
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
+ */
+ .p2align 4
+LABEL(ashr_10):
+ pslldq $6, %xmm2 /* shift %xmm2 left 6 bytes to align with %xmm1 */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER
+
+ mov $16, %rcx /* index for loads */
+ mov $10, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 10(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_10_use):
+ add $16, %r10
+ jg LABEL(nibble_ashr_10_use) /* %r10 > 0: page boundary, do a nibble */
+
+LABEL(nibble_ashr_10_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $10, -16(%rdi, %rdx), %xmm0 /* %xmm0 = 16 bytes at -6(%rdi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or null char found */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_10_use)
+
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $10, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use)
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz)
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_10_use)
+
+ .p2align 4
+LABEL(nibble_ashr_10_use):
+ sub $0x1000, %r10 /* rewind %r10 by one 4K page */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $10, %xmm0 /* drop the 10 bytes already compared */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null byte in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx */
+# endif
+ cmp $5, %ecx
+ ja LABEL(nibble_ashr_10_restart_use) /* no null char in the 6 leftover bytes */
+
+ jmp LABEL(nibble_ashr_exit_use)
+
+/*
+ * The following cases will be handled by ashr_11
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
+ */
+ .p2align 4
+LABEL(ashr_11): /* %rsi block offset is 5 bytes ahead of %rdi's */
+ pslldq $5, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER /* strncmp variants: update remaining count in %r11 */
+
+ mov $16, %rcx /* index for loads */
+ mov $11, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 11(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_11_use): /* main loop, unrolled twice: 16 bytes per half */
+ add $16, %r10
+ jg LABEL(nibble_ashr_11_use) /* page boundary reached: do a nibble first */
+
+LABEL(nibble_ashr_11_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $11, -16(%rdi, %rdx), %xmm0 /* %xmm0 = top 5 bytes of previous block + low 11 bytes of this one */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 more bytes counted against the length limit */
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_11_use) /* page boundary reached: do a nibble first */
+
+ movdqa (%rdi, %rdx), %xmm0 /* second unrolled half, same steps as above */
+ palignr $11, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_11_use)
+
+ .p2align 4
+LABEL(nibble_ashr_11_use):
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $11, %xmm0 /* keep the 5 leftover bytes of the previous block */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null char in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx: finish on the leftover bytes */
+# endif
+ cmp $4, %ecx
+ ja LABEL(nibble_ashr_11_restart_use) /* no null char in the 5 leftover bytes: resume loop */
+
+ jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */
+
+/*
+ * The following cases will be handled by ashr_12
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
+ */
+ .p2align 4
+LABEL(ashr_12): /* %rsi block offset is 4 bytes ahead of %rdi's */
+ pslldq $4, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER /* strncmp variants: update remaining count in %r11 */
+
+ mov $16, %rcx /* index for loads */
+ mov $12, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 12(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_12_use): /* main loop, unrolled twice: 16 bytes per half */
+ add $16, %r10
+ jg LABEL(nibble_ashr_12_use) /* page boundary reached: do a nibble first */
+
+LABEL(nibble_ashr_12_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $12, -16(%rdi, %rdx), %xmm0 /* %xmm0 = top 4 bytes of previous block + low 12 bytes of this one */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 more bytes counted against the length limit */
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_12_use) /* page boundary reached: do a nibble first */
+
+ movdqa (%rdi, %rdx), %xmm0 /* second unrolled half, same steps as above */
+ palignr $12, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_12_use)
+
+ .p2align 4
+LABEL(nibble_ashr_12_use):
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $12, %xmm0 /* keep the 4 leftover bytes of the previous block */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null char in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx: finish on the leftover bytes */
+# endif
+ cmp $3, %ecx
+ ja LABEL(nibble_ashr_12_restart_use) /* no null char in the 4 leftover bytes: resume loop */
+
+ jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */
+
+/*
+ * The following cases will be handled by ashr_13
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
+ */
+ .p2align 4
+LABEL(ashr_13): /* %rsi block offset is 3 bytes ahead of %rdi's */
+ pslldq $3, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER /* strncmp variants: update remaining count in %r11 */
+
+ mov $16, %rcx /* index for loads */
+ mov $13, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 13(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_13_use): /* main loop, unrolled twice: 16 bytes per half */
+ add $16, %r10
+ jg LABEL(nibble_ashr_13_use) /* page boundary reached: do a nibble first */
+
+LABEL(nibble_ashr_13_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $13, -16(%rdi, %rdx), %xmm0 /* %xmm0 = top 3 bytes of previous block + low 13 bytes of this one */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 more bytes counted against the length limit */
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_13_use) /* page boundary reached: do a nibble first */
+
+ movdqa (%rdi, %rdx), %xmm0 /* second unrolled half, same steps as above */
+ palignr $13, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_13_use)
+
+ .p2align 4
+LABEL(nibble_ashr_13_use):
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $13, %xmm0 /* keep the 3 leftover bytes of the previous block */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null char in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx: finish on the leftover bytes */
+# endif
+ cmp $2, %ecx
+ ja LABEL(nibble_ashr_13_restart_use) /* no null char in the 3 leftover bytes: resume loop */
+
+ jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */
+
+/*
+ * The following cases will be handled by ashr_14
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
+ */
+ .p2align 4
+LABEL(ashr_14): /* %rsi block offset is 2 bytes ahead of %rdi's */
+ pslldq $2, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER /* strncmp variants: update remaining count in %r11 */
+
+ mov $16, %rcx /* index for loads */
+ mov $14, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 14(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_14_use): /* main loop, unrolled twice: 16 bytes per half */
+ add $16, %r10
+ jg LABEL(nibble_ashr_14_use) /* page boundary reached: do a nibble first */
+
+LABEL(nibble_ashr_14_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $14, -16(%rdi, %rdx), %xmm0 /* %xmm0 = top 2 bytes of previous block + low 14 bytes of this one */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 more bytes counted against the length limit */
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_14_use) /* page boundary reached: do a nibble first */
+
+ movdqa (%rdi, %rdx), %xmm0 /* second unrolled half, same steps as above */
+ palignr $14, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_14_use)
+
+ .p2align 4
+LABEL(nibble_ashr_14_use):
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $14, %xmm0 /* keep the 2 leftover bytes of the previous block */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null char in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx: finish on the leftover bytes */
+# endif
+ cmp $1, %ecx
+ ja LABEL(nibble_ashr_14_restart_use) /* no null char in the 2 leftover bytes: resume loop */
+
+ jmp LABEL(nibble_ashr_exit_use) /* string ends before the page boundary */
+
+/*
+ * The following cases will be handled by ashr_15
+ * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
+ * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
+ */
+ .p2align 4
+LABEL(ashr_15): /* %rsi block offset is 1 byte ahead of %rdi's */
+ pslldq $1, %xmm2 /* shift first string to align with second */
+ TOLOWER (%xmm1, %xmm2)
+ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
+ psubb %xmm0, %xmm2 /* packed sub of comparison results */
+ pmovmskb %xmm2, %r9d
+ shr %cl, %edx /* adjust 0xffff for offset */
+ shr %cl, %r9d /* adjust for 16-byte offset */
+ sub %r9d, %edx
+ jnz LABEL(less32bytes) /* mismatch or null char seen */
+
+ movdqa (%rdi), %xmm3
+
+ UPDATE_STRNCMP_COUNTER /* strncmp variants: update remaining count in %r11 */
+
+ mov $16, %rcx /* index for loads */
+ mov $15, %r9d /* byte position left over from less32bytes case */
+ /*
+ * Setup %r10 value allows us to detect crossing a page boundary.
+ * When %r10 goes positive we have crossed a page boundary and
+ * need to do a nibble.
+ */
+ lea 15(%rdi), %r10
+ and $0xfff, %r10 /* offset into 4K page */
+
+ sub $0x1000, %r10 /* subtract 4K pagesize */
+
+ mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
+
+ .p2align 4
+LABEL(loop_ashr_15_use): /* main loop, unrolled twice: 16 bytes per half */
+ add $16, %r10
+ jg LABEL(nibble_ashr_15_use) /* page boundary reached: do a nibble first */
+
+LABEL(nibble_ashr_15_restart_use):
+ movdqa (%rdi, %rdx), %xmm0
+ palignr $15, -16(%rdi, %rdx), %xmm0 /* %xmm0 = top byte of previous block + low 15 bytes of this one */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* CFlag or ZFlag set: mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11 /* 16 more bytes counted against the length limit */
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+
+ add $16, %rdx
+ add $16, %r10
+ jg LABEL(nibble_ashr_15_use) /* page boundary reached: do a nibble first */
+
+ movdqa (%rdi, %rdx), %xmm0 /* second unrolled half, same steps as above */
+ palignr $15, -16(%rdi, %rdx), %xmm0
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a, (%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ jbe LABEL(exit_use) /* mismatch or EOS in this chunk */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub $16, %r11
+ jbe LABEL(strcmp_exitz) /* limit used up: return 0 */
+# endif
+ add $16, %rdx
+ jmp LABEL(loop_ashr_15_use)
+
+ .p2align 4
+LABEL(nibble_ashr_15_use):
+ sub $0x1000, %r10 /* re-arm the page-crossing counter */
+ movdqa -16(%rdi, %rdx), %xmm0
+ psrldq $15, %xmm0 /* keep the 1 leftover byte of the previous block */
+ pcmpistri $0x3a,%xmm0, %xmm0 /* %ecx = index of first null char in %xmm0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ cmp %r11, %rcx
+ jae LABEL(nibble_ashr_exit_use) /* remaining count <= %rcx: finish on the leftover byte */
+# endif
+ cmp $0, %ecx
+ ja LABEL(nibble_ashr_15_restart_use) /* leftover byte is not null: resume loop; otherwise fall through */
+
+LABEL(nibble_ashr_exit_use): /* shared nibble exit: compare %xmm0 with the 16 bytes at (%rsi,%rdx) */
+# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
+ pcmpistri $0x1a,(%rsi,%rdx), %xmm0
+# else
+ movdqa (%rsi,%rdx), %xmm1
+ TOLOWER (%xmm0, %xmm1)
+ pcmpistri $0x1a, %xmm1, %xmm0
+# endif
+ .p2align 4
+LABEL(exit_use): /* entered with pcmpistri flags live and %ecx = index it reported */
+ jnc LABEL(strcmp_exitz) /* CFlag == 0: no difference found, return 0 */
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %rcx, %r11
+ jbe LABEL(strcmp_exitz) /* difference is at or past the length limit: return 0 */
+# endif
+ add %rcx, %rdx /* %rdx = offset of the differing byte from %rsi */
+ lea -16(%rdi, %r9), %rdi /* rebase %rdi by the shift in %r9 so the same index works */
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ test %r8d, %r8d
+ jz LABEL(ret_use) /* %r8d == 0: operands were not swapped */
+ xchg %eax, %edx /* swapped earlier: restore original operand order */
+LABEL(ret_use):
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
+ movl (%rcx,%rdx,4), %edx /* map both bytes through the tolower table (4-byte entries) */
+ movl (%rcx,%rax,4), %eax
+# endif
+
+ sub %edx, %eax /* return the difference of the two bytes */
+ ret
+
+LABEL(less32bytes): /* mismatch or null char found in the first, partial block */
+ lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
+ lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
+ test %r8d, %r8d
+ jz LABEL(ret) /* %r8d == 0: operands were not swapped */
+ xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
+
+ .p2align 4
+LABEL(ret):
+LABEL(less16bytes): /* %rdx holds a nonzero bit mask; lowest set bit is used as byte index */
+ bsf %rdx, %rdx /* find and store bit index in %rdx */
+
+# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+ sub %rdx, %r11
+ jbe LABEL(strcmp_exitz) /* that byte is at or past the length limit: return 0 */
+# endif
+ movzbl (%rsi, %rdx), %ecx /* load the byte at that index from each string */
+ movzbl (%rdi, %rdx), %eax
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+ movl (%rdx,%rcx,4), %ecx /* map both bytes through the tolower table (4-byte entries) */
+ movl (%rdx,%rax,4), %eax
+# endif
+
+ sub %ecx, %eax /* return the difference of the two bytes */
+ ret
+
+LABEL(strcmp_exitz): /* common exit for "strings compare equal" */
+ xor %eax, %eax /* return 0 */
+ ret
+
+ .p2align 4
+ // XXX Same as code above
+LABEL(Byte0): /* compare only the first byte of each string */
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+
+# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
+ leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+ movl (%rdx,%rcx,4), %ecx /* map both bytes through the tolower table (4-byte entries) */
+ movl (%rdx,%rax,4), %eax
+# endif
+
+ sub %ecx, %eax /* return the difference of the two bytes */
+ ret
+ cfi_endproc
+ .size STRCMP, .-STRCMP
+
+# undef UCLOW_reg
+# undef UCHIGH_reg
+# undef LCQWORD_reg
+# undef TOLOWER
+
+ /* Put all SSE 4.2 functions together. */
+ .section .rodata.SECTION,"a",@progbits
+ .p2align 3
+LABEL(unaligned_table): /* 32-bit offsets of the ashr_N entry points, relative to this table */
+ .int LABEL(ashr_1) - LABEL(unaligned_table) /* index 0 */
+ .int LABEL(ashr_2) - LABEL(unaligned_table)
+ .int LABEL(ashr_3) - LABEL(unaligned_table)
+ .int LABEL(ashr_4) - LABEL(unaligned_table)
+ .int LABEL(ashr_5) - LABEL(unaligned_table)
+ .int LABEL(ashr_6) - LABEL(unaligned_table)
+ .int LABEL(ashr_7) - LABEL(unaligned_table)
+ .int LABEL(ashr_8) - LABEL(unaligned_table)
+ .int LABEL(ashr_9) - LABEL(unaligned_table)
+ .int LABEL(ashr_10) - LABEL(unaligned_table)
+ .int LABEL(ashr_11) - LABEL(unaligned_table)
+ .int LABEL(ashr_12) - LABEL(unaligned_table)
+ .int LABEL(ashr_13) - LABEL(unaligned_table)
+ .int LABEL(ashr_14) - LABEL(unaligned_table)
+ .int LABEL(ashr_15) - LABEL(unaligned_table)
+ .int LABEL(ashr_0) - LABEL(unaligned_table) /* index 15 */
+
+# undef LABEL
+# undef GLABEL
+# undef SECTION
+# undef movdqa
+# undef movdqu
+# undef pmovmskb
+# undef pcmpistri
+# undef psubb
+# undef pcmpeqb
+# undef psrldq
+# undef pslldq
+# undef palignr
+# undef pxor
+# undef D
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
index 08e23548c3..1ce5c4e93f 100644
--- a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
@@ -16,6 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define STRCMP_SSE42 __strncasecmp_l_sse42
#define USE_AS_STRNCASECMP_L
-#include "strcmp-sse42.S"
+#include "strcmp-sse4_2.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
index 310a6dbe77..2a02f0c2a6 100644
--- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
@@ -16,8 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# define STRCMP_SSE42 __strncmp_sse42
-# define USE_AS_STRNCMP
-# include "strcmp-sse42.S"
-#endif
+#define USE_AS_STRNCMP
+#include "strcmp-sse4_2.S"
--
2.34.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 4/4] x86: Move strcmp SSE42 implementation to multiarch/strcmp-sse4_2.S
2022-07-12 19:28 ` [PATCH v1 4/4] x86: Move strcmp SSE42 implementation to multiarch/strcmp-sse4_2.S Noah Goldstein
@ 2022-07-13 0:07 ` H.J. Lu
0 siblings, 0 replies; 8+ messages in thread
From: H.J. Lu @ 2022-07-13 0:07 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, its just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> .../x86_64/multiarch/strcasecmp_l-sse4_2.S | 3 +-
> sysdeps/x86_64/multiarch/strcmp-sse42.S | 1782 -----------------
> sysdeps/x86_64/multiarch/strcmp-sse4_2.S | 1763 +++++++++++++++-
> sysdeps/x86_64/multiarch/strncase_l-sse4_2.S | 3 +-
> sysdeps/x86_64/multiarch/strncmp-sse4_2.S | 7 +-
> 5 files changed, 1766 insertions(+), 1792 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcmp-sse42.S
>
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
> index 411ab7d283..ac03b95756 100644
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
> +++ b/sysdeps/x86_64/multiarch/strcasecmp_l-sse4_2.S
> @@ -16,6 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#define STRCMP_SSE42 __strcasecmp_l_sse42
> #define USE_AS_STRCASECMP_L
> -#include "strcmp-sse42.S"
> +#include "strcmp-sse4_2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
> deleted file mode 100644
> index 60313c647a..0000000000
> --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
> +++ /dev/null
> @@ -1,1782 +0,0 @@
> -/* strcmp with SSE4.2
> - Copyright (C) 2009-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#ifndef STRCMP_SSE42
> -# define STRCMP_SSE42 __strcmp_sse42
> -#endif
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# include "locale-defines.h"
> -#endif
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> -/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
> - if the new counter > the old one or is 0. */
> -# define UPDATE_STRNCMP_COUNTER \
> - /* calculate left number to compare */ \
> - lea -16(%rcx, %r11), %r9; \
> - cmp %r9, %r11; \
> - jb LABEL(strcmp_exitz); \
> - test %r9, %r9; \
> - je LABEL(strcmp_exitz); \
> - mov %r9, %r11
> -#else
> -# define UPDATE_STRNCMP_COUNTER
> -#endif
> -
> -#define SECTION sse4.2
> -#define GLABEL(l) l##_sse42
> -
> -#define LABEL(l) .L##l
> -
> -/* We use 0x1a:
> - _SIDD_SBYTE_OPS
> - | _SIDD_CMP_EQUAL_EACH
> - | _SIDD_NEGATIVE_POLARITY
> - | _SIDD_LEAST_SIGNIFICANT
> - on pcmpistri to find out if two 16byte data elements are the same
> - and the offset of the first different byte. There are 4 cases:
> -
> - 1. Both 16byte data elements are valid and identical.
> - 2. Both 16byte data elements have EOS and identical.
> - 3. Both 16byte data elements are valid and they differ at offset X.
> - 4. At least one 16byte data element has EOS at offset X. Two 16byte
> - data elements must differ at or before offset X.
> -
> - Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
> -
> - case ECX CFlag ZFlag SFlag
> - 1 16 0 0 0
> - 2 16 0 1 1
> - 3 X 1 0 0
> - 4 0 <= X 1 0/1 0/1
> -
> - We exit from the loop for cases 2, 3 and 4 with jbe which branches
> - when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
> - case 2. */
> -
> - /* Put all SSE 4.2 functions together. */
> - .section .text.SECTION,"ax",@progbits
> - .align 16
> - .type STRCMP_SSE42, @function
> - .globl STRCMP_SSE42
> -#ifdef USE_AS_STRCASECMP_L
> -ENTRY (GLABEL(__strcasecmp))
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RDX_LP
> -
> - /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> - .p2align 4
> -END (GLABEL(__strcasecmp))
> - /* FALLTHROUGH to strcasecmp_l. */
> -#endif
> -#ifdef USE_AS_STRNCASECMP_L
> -ENTRY (GLABEL(__strncasecmp))
> - movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> - mov %fs:(%rax),%RCX_LP
> -
> - /* Either 1 or 5 bytes (dependeing if CET is enabled). */
> - .p2align 4
> -END (GLABEL(__strncasecmp))
> - /* FALLTHROUGH to strncasecmp_l. */
> -#endif
> -
> -
> -#define arg arg
> -
> -STRCMP_SSE42:
> - cfi_startproc
> - _CET_ENDBR
> - CALL_MCOUNT
> -
> -/*
> - * This implementation uses SSE to compare up to 16 bytes at a time.
> - */
> -#ifdef USE_AS_STRCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
> -# else
> - mov (%rdx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strcasecmp_l_nonascii
> -#endif
> -#ifdef USE_AS_STRNCASECMP_L
> - /* We have to fall back on the C implementation for locales
> - with encodings not matching ASCII for single bytes. */
> -# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> - mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
> -# else
> - mov (%rcx), %RAX_LP
> -# endif
> - testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> - jne __strncasecmp_l_nonascii
> -#endif
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - test %RDX_LP, %RDX_LP
> - je LABEL(strcmp_exitz)
> - cmp $1, %RDX_LP
> - je LABEL(Byte0)
> - mov %RDX_LP, %R11_LP
> -#endif
> - mov %esi, %ecx
> - mov %edi, %eax
> -/* Use 64bit AND here to avoid long NOP padding. */
> - and $0x3f, %rcx /* rsi alignment in cache line */
> - and $0x3f, %rax /* rdi alignment in cache line */
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - .section .rodata.cst16,"aM",@progbits,16
> - .align 16
> -LABEL(lcase_min):
> - .quad 0x3f3f3f3f3f3f3f3f
> - .quad 0x3f3f3f3f3f3f3f3f
> -LABEL(lcase_max):
> - .quad 0x9999999999999999
> - .quad 0x9999999999999999
> -LABEL(case_add):
> - .quad 0x2020202020202020
> - .quad 0x2020202020202020
> - .previous
> - movdqa LABEL(lcase_min)(%rip), %xmm4
> -# define LCASE_MIN_reg %xmm4
> - movdqa LABEL(lcase_max)(%rip), %xmm5
> -# define LCASE_MAX_reg %xmm5
> - movdqa LABEL(case_add)(%rip), %xmm6
> -# define CASE_ADD_reg %xmm6
> -#endif
> - cmp $0x30, %ecx
> - ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> - cmp $0x30, %eax
> - ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
> - movdqu (%rdi), %xmm1
> - movdqu (%rsi), %xmm2
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> -# define TOLOWER(reg1, reg2) \
> - movdqa LCASE_MIN_reg, %xmm7; \
> - movdqa LCASE_MIN_reg, %xmm8; \
> - paddb reg1, %xmm7; \
> - paddb reg2, %xmm8; \
> - pcmpgtb LCASE_MAX_reg, %xmm7; \
> - pcmpgtb LCASE_MAX_reg, %xmm8; \
> - pandn CASE_ADD_reg, %xmm7; \
> - pandn CASE_ADD_reg, %xmm8; \
> - paddb %xmm7, reg1; \
> - paddb %xmm8, reg2
> -
> - TOLOWER (%xmm1, %xmm2)
> -#else
> -# define TOLOWER(reg1, reg2)
> -#endif
> - pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> - pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %edx
> - sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> - jnz LABEL(less16bytes)/* If not, find different value or null char */
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)/* finish comparison */
> -#endif
> - add $16, %rsi /* prepare to search next 16 bytes */
> - add $16, %rdi /* prepare to search next 16 bytes */
> -
> - /*
> - * Determine source and destination string offsets from 16-byte
> - * alignment. Use relative offset difference between the two to
> - * determine which case below to use.
> - */
> - .p2align 4
> -LABEL(crosscache):
> - and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
> - and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
> - mov $0xffff, %edx /* for equivalent offset */
> - xor %r8d, %r8d
> - and $0xf, %ecx /* offset of rsi */
> - and $0xf, %eax /* offset of rdi */
> - pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> - cmp %eax, %ecx
> - je LABEL(ashr_0) /* rsi and rdi relative offset same */
> - ja LABEL(bigger)
> - mov %edx, %r8d /* r8d is offset flag for exit tail */
> - xchg %ecx, %eax
> - xchg %rsi, %rdi
> -LABEL(bigger):
> - movdqa (%rdi), %xmm2
> - movdqa (%rsi), %xmm1
> - lea 15(%rax), %r9
> - sub %rcx, %r9
> - lea LABEL(unaligned_table)(%rip), %r10
> - movslq (%r10, %r9,4), %r9
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> - lea (%r10, %r9), %r10
> - _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
> -
> -/*
> - * The following cases will be handled by ashr_0
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(0~15) n(0~15) 15(15+ n-n) ashr_0
> - */
> - .p2align 4
> -LABEL(ashr_0):
> -
> - movdqa (%rsi), %xmm1
> - pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> -#else
> - movdqa (%rdi), %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> -#endif
> - psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> - pmovmskb %xmm1, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - /*
> - * edx must be the same with r9d if in left byte (16-rcx) is equal to
> - * the start from (16-rax) and no null char was seen.
> - */
> - jne LABEL(less32bytes) /* mismatch or null char */
> - UPDATE_STRNCMP_COUNTER
> - mov $16, %rcx
> - mov $16, %r9
> -
> - /*
> - * Now both strings are aligned at 16-byte boundary. Loop over strings
> - * checking 32-bytes per iteration.
> - */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> - .p2align 4
> -LABEL(ashr_0_use):
> - movdqa (%rdi,%rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - lea 16(%rdx), %rdx
> - jbe LABEL(ashr_0_exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - movdqa (%rdi,%rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - lea 16(%rdx), %rdx
> - jbe LABEL(ashr_0_exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - jmp LABEL(ashr_0_use)
> -
> -
> - .p2align 4
> -LABEL(ashr_0_exit_use):
> - jnc LABEL(strcmp_exitz)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rcx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - lea -16(%rdx, %rcx), %rcx
> - movzbl (%rdi, %rcx), %eax
> - movzbl (%rsi, %rcx), %edx
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> - movl (%rcx,%rax,4), %eax
> - movl (%rcx,%rdx,4), %edx
> -#endif
> - sub %edx, %eax
> - ret
> -
> -
> -
> -/*
> - * The following cases will be handled by ashr_1
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(15) n -15 0(15 +(n-15) - n) ashr_1
> - */
> - .p2align 4
> -LABEL(ashr_1):
> - pslldq $15, %xmm2 /* shift first string to align with second */
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> - psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx /* adjust 0xffff for offset */
> - shr %cl, %r9d /* adjust for 16-byte offset */
> - sub %r9d, %edx
> - jnz LABEL(less32bytes) /* mismatch or null char seen */
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads*/
> - mov $1, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 1(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_1_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_1_use)
> -
> -LABEL(nibble_ashr_1_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_1_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $1, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_1_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_1_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $1, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $14, %ecx
> - ja LABEL(nibble_ashr_1_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_2
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
> - */
> - .p2align 4
> -LABEL(ashr_2):
> - pslldq $14, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $2, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 2(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_2_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_2_use)
> -
> -LABEL(nibble_ashr_2_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_2_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $2, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_2_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_2_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $2, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $13, %ecx
> - ja LABEL(nibble_ashr_2_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_3
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
> - */
> - .p2align 4
> -LABEL(ashr_3):
> - pslldq $13, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $3, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 3(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> -LABEL(loop_ashr_3_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_3_use)
> -
> -LABEL(nibble_ashr_3_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_3_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $3, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_3_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_3_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $3, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $12, %ecx
> - ja LABEL(nibble_ashr_3_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_4
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
> - */
> - .p2align 4
> -LABEL(ashr_4):
> - pslldq $12, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $4, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 4(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_4_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_4_use)
> -
> -LABEL(nibble_ashr_4_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_4_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $4, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_4_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_4_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $4, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $11, %ecx
> - ja LABEL(nibble_ashr_4_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_5
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
> - */
> - .p2align 4
> -LABEL(ashr_5):
> - pslldq $11, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $5, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 5(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_5_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_5_use)
> -
> -LABEL(nibble_ashr_5_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $5, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_5_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> -
> - palignr $5, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_5_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_5_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $5, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $10, %ecx
> - ja LABEL(nibble_ashr_5_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_6
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
> - */
> - .p2align 4
> -LABEL(ashr_6):
> - pslldq $10, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $6, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 6(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_6_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_6_use)
> -
> -LABEL(nibble_ashr_6_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_6_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $6, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_6_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_6_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $6, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $9, %ecx
> - ja LABEL(nibble_ashr_6_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_7
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
> - */
> - .p2align 4
> -LABEL(ashr_7):
> - pslldq $9, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $7, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 7(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_7_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_7_use)
> -
> -LABEL(nibble_ashr_7_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_7_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $7, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_7_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_7_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $7, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $8, %ecx
> - ja LABEL(nibble_ashr_7_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_8
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8
> - */
> - .p2align 4
> -LABEL(ashr_8):
> - pslldq $8, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $8, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 8(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_8_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_8_use)
> -
> -LABEL(nibble_ashr_8_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_8_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $8, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_8_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_8_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $8, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $7, %ecx
> - ja LABEL(nibble_ashr_8_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_9
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9
> - */
> - .p2align 4
> -LABEL(ashr_9):
> - pslldq $7, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $9, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 9(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_9_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_9_use)
> -
> -LABEL(nibble_ashr_9_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> -
> - palignr $9, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_9_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $9, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_9_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_9_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $9, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $6, %ecx
> - ja LABEL(nibble_ashr_9_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_10
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10
> - */
> - .p2align 4
> -LABEL(ashr_10):
> - pslldq $6, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $10, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 10(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_10_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_10_use)
> -
> -LABEL(nibble_ashr_10_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_10_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $10, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_10_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_10_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $10, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $5, %ecx
> - ja LABEL(nibble_ashr_10_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_11
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11
> - */
> - .p2align 4
> -LABEL(ashr_11):
> - pslldq $5, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $11, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 11(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_11_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_11_use)
> -
> -LABEL(nibble_ashr_11_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_11_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $11, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_11_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_11_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $11, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $4, %ecx
> - ja LABEL(nibble_ashr_11_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_12
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12
> - */
> - .p2align 4
> -LABEL(ashr_12):
> - pslldq $4, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $12, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 12(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_12_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_12_use)
> -
> -LABEL(nibble_ashr_12_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_12_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $12, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_12_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_12_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $12, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $3, %ecx
> - ja LABEL(nibble_ashr_12_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_13
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13
> - */
> - .p2align 4
> -LABEL(ashr_13):
> - pslldq $3, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $13, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 13(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_13_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_13_use)
> -
> -LABEL(nibble_ashr_13_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_13_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $13, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_13_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_13_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $13, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $2, %ecx
> - ja LABEL(nibble_ashr_13_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_14
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14
> - */
> - .p2align 4
> -LABEL(ashr_14):
> - pslldq $2, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $14, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 14(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_14_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_14_use)
> -
> -LABEL(nibble_ashr_14_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_14_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $14, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_14_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_14_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $14, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $1, %ecx
> - ja LABEL(nibble_ashr_14_restart_use)
> -
> - jmp LABEL(nibble_ashr_exit_use)
> -
> -/*
> - * The following cases will be handled by ashr_15
> - * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> - * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15
> - */
> - .p2align 4
> -LABEL(ashr_15):
> - pslldq $1, %xmm2
> - TOLOWER (%xmm1, %xmm2)
> - pcmpeqb %xmm1, %xmm2
> - psubb %xmm0, %xmm2
> - pmovmskb %xmm2, %r9d
> - shr %cl, %edx
> - shr %cl, %r9d
> - sub %r9d, %edx
> - jnz LABEL(less32bytes)
> -
> - movdqa (%rdi), %xmm3
> -
> - UPDATE_STRNCMP_COUNTER
> -
> - mov $16, %rcx /* index for loads */
> - mov $15, %r9d /* byte position left over from less32bytes case */
> - /*
> - * Setup %r10 value allows us to detect crossing a page boundary.
> - * When %r10 goes positive we have crossed a page boundary and
> - * need to do a nibble.
> - */
> - lea 15(%rdi), %r10
> - and $0xfff, %r10 /* offset into 4K page */
> -
> - sub $0x1000, %r10 /* subtract 4K pagesize */
> -
> - mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> -
> - .p2align 4
> -LABEL(loop_ashr_15_use):
> - add $16, %r10
> - jg LABEL(nibble_ashr_15_use)
> -
> -LABEL(nibble_ashr_15_restart_use):
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> -
> - add $16, %rdx
> - add $16, %r10
> - jg LABEL(nibble_ashr_15_use)
> -
> - movdqa (%rdi, %rdx), %xmm0
> - palignr $15, -16(%rdi, %rdx), %xmm0
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - jbe LABEL(exit_use)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub $16, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add $16, %rdx
> - jmp LABEL(loop_ashr_15_use)
> -
> - .p2align 4
> -LABEL(nibble_ashr_15_use):
> - sub $0x1000, %r10
> - movdqa -16(%rdi, %rdx), %xmm0
> - psrldq $15, %xmm0
> - pcmpistri $0x3a,%xmm0, %xmm0
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - cmp %r11, %rcx
> - jae LABEL(nibble_ashr_exit_use)
> -#endif
> - cmp $0, %ecx
> - ja LABEL(nibble_ashr_15_restart_use)
> -
> -LABEL(nibble_ashr_exit_use):
> -#if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> - pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> -#else
> - movdqa (%rsi,%rdx), %xmm1
> - TOLOWER (%xmm0, %xmm1)
> - pcmpistri $0x1a, %xmm1, %xmm0
> -#endif
> - .p2align 4
> -LABEL(exit_use):
> - jnc LABEL(strcmp_exitz)
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rcx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - add %rcx, %rdx
> - lea -16(%rdi, %r9), %rdi
> - movzbl (%rdi, %rdx), %eax
> - movzbl (%rsi, %rdx), %edx
> - test %r8d, %r8d
> - jz LABEL(ret_use)
> - xchg %eax, %edx
> -LABEL(ret_use):
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> - movl (%rcx,%rdx,4), %edx
> - movl (%rcx,%rax,4), %eax
> -#endif
> -
> - sub %edx, %eax
> - ret
> -
> -LABEL(less32bytes):
> - lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
> - lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
> - test %r8d, %r8d
> - jz LABEL(ret)
> - xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
> -
> - .p2align 4
> -LABEL(ret):
> -LABEL(less16bytes):
> - bsf %rdx, %rdx /* find and store bit index in %rdx */
> -
> -#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> - sub %rdx, %r11
> - jbe LABEL(strcmp_exitz)
> -#endif
> - movzbl (%rsi, %rdx), %ecx
> - movzbl (%rdi, %rdx), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> -
> -LABEL(strcmp_exitz):
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> - // XXX Same as code above
> -LABEL(Byte0):
> - movzbl (%rsi), %ecx
> - movzbl (%rdi), %eax
> -
> -#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> - leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> - movl (%rdx,%rcx,4), %ecx
> - movl (%rdx,%rax,4), %eax
> -#endif
> -
> - sub %ecx, %eax
> - ret
> - cfi_endproc
> - .size STRCMP_SSE42, .-STRCMP_SSE42
> -
> -#undef UCLOW_reg
> -#undef UCHIGH_reg
> -#undef LCQWORD_reg
> -#undef TOLOWER
> -
> - /* Put all SSE 4.2 functions together. */
> - .section .rodata.SECTION,"a",@progbits
> - .p2align 3
> -LABEL(unaligned_table):
> - .int LABEL(ashr_1) - LABEL(unaligned_table)
> - .int LABEL(ashr_2) - LABEL(unaligned_table)
> - .int LABEL(ashr_3) - LABEL(unaligned_table)
> - .int LABEL(ashr_4) - LABEL(unaligned_table)
> - .int LABEL(ashr_5) - LABEL(unaligned_table)
> - .int LABEL(ashr_6) - LABEL(unaligned_table)
> - .int LABEL(ashr_7) - LABEL(unaligned_table)
> - .int LABEL(ashr_8) - LABEL(unaligned_table)
> - .int LABEL(ashr_9) - LABEL(unaligned_table)
> - .int LABEL(ashr_10) - LABEL(unaligned_table)
> - .int LABEL(ashr_11) - LABEL(unaligned_table)
> - .int LABEL(ashr_12) - LABEL(unaligned_table)
> - .int LABEL(ashr_13) - LABEL(unaligned_table)
> - .int LABEL(ashr_14) - LABEL(unaligned_table)
> - .int LABEL(ashr_15) - LABEL(unaligned_table)
> - .int LABEL(ashr_0) - LABEL(unaligned_table)
> -
> -#undef LABEL
> -#undef GLABEL
> -#undef SECTION
> -#undef movdqa
> -#undef movdqu
> -#undef pmovmskb
> -#undef pcmpistri
> -#undef psubb
> -#undef pcmpeqb
> -#undef psrldq
> -#undef pslldq
> -#undef palignr
> -#undef pxor
> -#undef D
> diff --git a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
> index 2c916bafa0..963e208ccb 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-sse4_2.S
> @@ -17,5 +17,1766 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# include "strcmp-sse42.S"
> +# include <sysdep.h>
> +
> +# define STRCMP_ISA _sse42
> +# include "strcmp-naming.h"
> +
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> +# include "locale-defines.h"
> +# endif
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> +/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
> + if the new counter > the old one or is 0. */
> +# define UPDATE_STRNCMP_COUNTER \
> + /* calculate left number to compare */ \
> + lea -16(%rcx, %r11), %r9; \
> + cmp %r9, %r11; \
> + jb LABEL(strcmp_exitz); \
> + test %r9, %r9; \
> + je LABEL(strcmp_exitz); \
> + mov %r9, %r11
> +# else
> +# define UPDATE_STRNCMP_COUNTER
> +# endif
> +
> +# define SECTION sse4.2
> +
> +# define LABEL(l) .L##l
> +
> +/* We use 0x1a:
> + _SIDD_SBYTE_OPS
> + | _SIDD_CMP_EQUAL_EACH
> + | _SIDD_NEGATIVE_POLARITY
> + | _SIDD_LEAST_SIGNIFICANT
> + on pcmpistri to find out if two 16byte data elements are the same
> + and the offset of the first different byte. There are 4 cases:
> +
> + 1. Both 16byte data elements are valid and identical.
> + 2. Both 16byte data elements have EOS and are identical.
> + 3. Both 16byte data elements are valid and they differ at offset X.
> + 4. At least one 16byte data element has EOS at offset X. Two 16byte
> + data elements must differ at or before offset X.
> +
> + Here is the table of ECX, CFlag, ZFlag and SFlag for 4 cases:
> +
> + case ECX CFlag ZFlag SFlag
> + 1 16 0 0 0
> + 2 16 0 1 1
> + 3 X 1 0 0
> + 4 0 <= X 1 0/1 0/1
> +
> + We exit from the loop for cases 2, 3 and 4 with jbe which branches
> + when either CFlag or ZFlag is 1. If CFlag == 0, we return 0 for
> + case 2. */
> +
> + /* Put all SSE 4.2 functions together. */
> + .section .text.SECTION,"ax",@progbits
> + .align 16
> + .type STRCMP, @function
> + .globl STRCMP
> +# ifdef USE_AS_STRCASECMP_L
> +ENTRY (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> + mov %fs:(%rax),%RDX_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (STRCASECMP)
> + /* FALLTHROUGH to strcasecmp_l. */
> +# endif
> +# ifdef USE_AS_STRNCASECMP_L
> +ENTRY (STRCASECMP)
> + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax
> + mov %fs:(%rax),%RCX_LP
> +
> + /* Either 1 or 5 bytes (depending on whether CET is enabled). */
> + .p2align 4
> +END (STRCASECMP)
> + /* FALLTHROUGH to strncasecmp_l. */
> +# endif
> +
> +
> +# define arg arg
> +
> +STRCMP:
> + cfi_startproc
> + _CET_ENDBR
> + CALL_MCOUNT
> +
> +/*
> + * This implementation uses SSE to compare up to 16 bytes at a time.
> + */
> +# ifdef USE_AS_STRCASECMP_L
> + /* We have to fall back on the C implementation for locales
> + with encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rdx), %RAX_LP
> +# else
> + mov (%rdx), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> + jne __strcasecmp_l_nonascii
> +# endif
> +# ifdef USE_AS_STRNCASECMP_L
> + /* We have to fall back on the C implementation for locales
> + with encodings not matching ASCII for single bytes. */
> +# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
> + mov LOCALE_T___LOCALES+LC_CTYPE*LP_SIZE(%rcx), %RAX_LP
> +# else
> + mov (%rcx), %RAX_LP
> +# endif
> + testl $1, LOCALE_DATA_VALUES+_NL_CTYPE_NONASCII_CASE*SIZEOF_VALUES(%rax)
> + jne __strncasecmp_l_nonascii
> +# endif
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + test %RDX_LP, %RDX_LP
> + je LABEL(strcmp_exitz)
> + cmp $1, %RDX_LP
> + je LABEL(Byte0)
> + mov %RDX_LP, %R11_LP
> +# endif
> + mov %esi, %ecx
> + mov %edi, %eax
> +/* Use 64bit AND here to avoid long NOP padding. */
> + and $0x3f, %rcx /* rsi alignment in cache line */
> + and $0x3f, %rax /* rdi alignment in cache line */
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + .section .rodata.cst16,"aM",@progbits,16
> + .align 16
> +LABEL(lcase_min):
> + .quad 0x3f3f3f3f3f3f3f3f
> + .quad 0x3f3f3f3f3f3f3f3f
> +LABEL(lcase_max):
> + .quad 0x9999999999999999
> + .quad 0x9999999999999999
> +LABEL(case_add):
> + .quad 0x2020202020202020
> + .quad 0x2020202020202020
> + .previous
> + movdqa LABEL(lcase_min)(%rip), %xmm4
> +# define LCASE_MIN_reg %xmm4
> + movdqa LABEL(lcase_max)(%rip), %xmm5
> +# define LCASE_MAX_reg %xmm5
> + movdqa LABEL(case_add)(%rip), %xmm6
> +# define CASE_ADD_reg %xmm6
> +# endif
> + cmp $0x30, %ecx
> + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
> + cmp $0x30, %eax
> + ja LABEL(crosscache)/* rdi: 16-byte load will cross cache line */
> + movdqu (%rdi), %xmm1
> + movdqu (%rsi), %xmm2
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> +# define TOLOWER(reg1, reg2) \
> + movdqa LCASE_MIN_reg, %xmm7; \
> + movdqa LCASE_MIN_reg, %xmm8; \
> + paddb reg1, %xmm7; \
> + paddb reg2, %xmm8; \
> + pcmpgtb LCASE_MAX_reg, %xmm7; \
> + pcmpgtb LCASE_MAX_reg, %xmm8; \
> + pandn CASE_ADD_reg, %xmm7; \
> + pandn CASE_ADD_reg, %xmm8; \
> + paddb %xmm7, reg1; \
> + paddb %xmm8, reg2
> +
> + TOLOWER (%xmm1, %xmm2)
> +# else
> +# define TOLOWER(reg1, reg2)
> +# endif
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> + pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %edx
> + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */
> + jnz LABEL(less16bytes)/* If not, find different value or null char */
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)/* finish comparison */
> +# endif
> + add $16, %rsi /* prepare to search next 16 bytes */
> + add $16, %rdi /* prepare to search next 16 bytes */
> +
> + /*
> + * Determine source and destination string offsets from 16-byte
> + * alignment. Use relative offset difference between the two to
> + * determine which case below to use.
> + */
> + .p2align 4
> +LABEL(crosscache):
> + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */
> + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */
> + mov $0xffff, %edx /* for equivalent offset */
> + xor %r8d, %r8d
> + and $0xf, %ecx /* offset of rsi */
> + and $0xf, %eax /* offset of rdi */
> + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */
> + cmp %eax, %ecx
> + je LABEL(ashr_0) /* rsi and rdi relative offset same */
> + ja LABEL(bigger)
> + mov %edx, %r8d /* r8d is offset flag for exit tail */
> + xchg %ecx, %eax
> + xchg %rsi, %rdi
> +LABEL(bigger):
> + movdqa (%rdi), %xmm2
> + movdqa (%rsi), %xmm1
> + lea 15(%rax), %r9
> + sub %rcx, %r9
> + lea LABEL(unaligned_table)(%rip), %r10
> + movslq (%r10, %r9,4), %r9
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> + lea (%r10, %r9), %r10
> + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */
> +
> +/*
> + * The following cases will be handled by ashr_0
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(0~15) n(0~15) 15(15+ n-n) ashr_0
> + */
> + .p2align 4
> +LABEL(ashr_0):
> +
> + movdqa (%rsi), %xmm1
> + pcmpeqb %xmm1, %xmm0 /* Any null chars? */
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */
> +# else
> + movdqa (%rdi), %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */
> +# endif
> + psubb %xmm0, %xmm1 /* packed sub of comparison results*/
> + pmovmskb %xmm1, %r9d
> + shr %cl, %edx /* adjust 0xffff for offset */
> + shr %cl, %r9d /* adjust for 16-byte offset */
> + sub %r9d, %edx
> + /*
> + * edx equals r9d only if the remaining (16 - rcx) bytes of both
> + * strings are equal and no null char was seen.
> + */
> + jne LABEL(less32bytes) /* mismatch or null char */
> + UPDATE_STRNCMP_COUNTER
> + mov $16, %rcx
> + mov $16, %r9
> +
> + /*
> + * Now both strings are aligned at 16-byte boundary. Loop over strings
> + * checking 32-bytes per iteration.
> + */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> + .p2align 4
> +LABEL(ashr_0_use):
> + movdqa (%rdi,%rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + lea 16(%rdx), %rdx
> + jbe LABEL(ashr_0_exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + movdqa (%rdi,%rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + lea 16(%rdx), %rdx
> + jbe LABEL(ashr_0_exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + jmp LABEL(ashr_0_use)
> +
> +
> + .p2align 4
> +LABEL(ashr_0_exit_use):
> + jnc LABEL(strcmp_exitz)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub %rcx, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + lea -16(%rdx, %rcx), %rcx
> + movzbl (%rdi, %rcx), %eax
> + movzbl (%rsi, %rcx), %edx
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> + movl (%rcx,%rax,4), %eax
> + movl (%rcx,%rdx,4), %edx
> +# endif
> + sub %edx, %eax
> + ret
> +
> +
> +
> +/*
> + * The following cases will be handled by ashr_1
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(15) n -15 0(15 +(n-15) - n) ashr_1
> + */
> + .p2align 4
> +LABEL(ashr_1):
> + pslldq $15, %xmm2 /* shift first string to align with second */
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */
> + psubb %xmm0, %xmm2 /* packed sub of comparison results*/
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx /* adjust 0xffff for offset */
> + shr %cl, %r9d /* adjust for 16-byte offset */
> + sub %r9d, %edx
> + jnz LABEL(less32bytes) /* mismatch or null char seen */
> + movdqa (%rdi), %xmm3
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads*/
> + mov $1, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 1(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_1_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_1_use)
> +
> +LABEL(nibble_ashr_1_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $1, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_1_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $1, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_1_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_1_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $1, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $14, %ecx
> + ja LABEL(nibble_ashr_1_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_2
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2
> + */
> + .p2align 4
> +LABEL(ashr_2):
> + pslldq $14, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $2, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 2(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_2_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_2_use)
> +
> +LABEL(nibble_ashr_2_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $2, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_2_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $2, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_2_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_2_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $2, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $13, %ecx
> + ja LABEL(nibble_ashr_2_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_3
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3
> + */
> + .p2align 4
> +LABEL(ashr_3):
> + pslldq $13, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $3, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 3(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> +LABEL(loop_ashr_3_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_3_use)
> +
> +LABEL(nibble_ashr_3_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $3, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_3_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $3, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_3_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_3_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $3, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $12, %ecx
> + ja LABEL(nibble_ashr_3_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_4
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4
> + */
> + .p2align 4
> +LABEL(ashr_4):
> + pslldq $12, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $4, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 4(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_4_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_4_use)
> +
> +LABEL(nibble_ashr_4_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $4, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_4_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $4, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_4_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_4_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $4, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $11, %ecx
> + ja LABEL(nibble_ashr_4_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_5
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5
> + */
> + .p2align 4
> +LABEL(ashr_5):
> + pslldq $11, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $5, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 5(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_5_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_5_use)
> +
> +LABEL(nibble_ashr_5_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $5, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_5_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> +
> + palignr $5, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_5_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_5_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $5, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $10, %ecx
> + ja LABEL(nibble_ashr_5_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_6
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6
> + */
> + .p2align 4
> +LABEL(ashr_6):
> + pslldq $10, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $6, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 6(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_6_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_6_use)
> +
> +LABEL(nibble_ashr_6_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $6, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_6_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $6, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_6_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_6_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $6, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $9, %ecx
> + ja LABEL(nibble_ashr_6_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_7
> + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
> + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7
> + */
> + .p2align 4
> +LABEL(ashr_7):
> + pslldq $9, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $7, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Setup %r10 value allows us to detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble.
> + */
> + lea 7(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_7_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_7_use)
> +
> +LABEL(nibble_ashr_7_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $7, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_7_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $7, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_7_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_7_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $7, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $8, %ecx
> + ja LABEL(nibble_ashr_7_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_8
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (8~15)            | n - 8               | 7 (15 + (n - 8) - n)  | ashr_8
> + */
> + .p2align 4
> +LABEL(ashr_8):
> + pslldq $8, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $8, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_8_use)).
> + */
> + lea 8(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_8_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_8_use)
> +
> +LABEL(nibble_ashr_8_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $8, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_8_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $8, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_8_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_8_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $8, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $7, %ecx
> + ja LABEL(nibble_ashr_8_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_9
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (7~15)            | n - 7               | 8 (15 + (n - 7) - n)  | ashr_9
> + */
> + .p2align 4
> +LABEL(ashr_9):
> + pslldq $7, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $9, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_9_use)).
> + */
> + lea 9(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_9_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_9_use)
> +
> +LABEL(nibble_ashr_9_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> +
> + palignr $9, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_9_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $9, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_9_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_9_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $9, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $6, %ecx
> + ja LABEL(nibble_ashr_9_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_10
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (6~15)            | n - 6               | 9 (15 + (n - 6) - n)  | ashr_10
> + */
> + .p2align 4
> +LABEL(ashr_10):
> + pslldq $6, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $10, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_10_use)).
> + */
> + lea 10(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_10_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_10_use)
> +
> +LABEL(nibble_ashr_10_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $10, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_10_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $10, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_10_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_10_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $10, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $5, %ecx
> + ja LABEL(nibble_ashr_10_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_11
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (5~15)            | n - 5               | 10 (15 + (n - 5) - n) | ashr_11
> + */
> + .p2align 4
> +LABEL(ashr_11):
> + pslldq $5, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $11, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_11_use)).
> + */
> + lea 11(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_11_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_11_use)
> +
> +LABEL(nibble_ashr_11_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $11, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_11_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $11, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_11_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_11_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $11, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $4, %ecx
> + ja LABEL(nibble_ashr_11_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_12
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (4~15)            | n - 4               | 11 (15 + (n - 4) - n) | ashr_12
> + */
> + .p2align 4
> +LABEL(ashr_12):
> + pslldq $4, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $12, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_12_use)).
> + */
> + lea 12(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_12_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_12_use)
> +
> +LABEL(nibble_ashr_12_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $12, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_12_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $12, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_12_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_12_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $12, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $3, %ecx
> + ja LABEL(nibble_ashr_12_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_13
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (3~15)            | n - 3               | 12 (15 + (n - 3) - n) | ashr_13
> + */
> + .p2align 4
> +LABEL(ashr_13):
> + pslldq $3, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $13, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_13_use)).
> + */
> + lea 13(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_13_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_13_use)
> +
> +LABEL(nibble_ashr_13_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $13, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_13_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $13, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_13_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_13_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $13, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $2, %ecx
> + ja LABEL(nibble_ashr_13_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_14
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (2~15)            | n - 2               | 13 (15 + (n - 2) - n) | ashr_14
> + */
> + .p2align 4
> +LABEL(ashr_14):
> + pslldq $2, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $14, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_14_use)).
> + */
> + lea 14(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_14_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_14_use)
> +
> +LABEL(nibble_ashr_14_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $14, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_14_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $14, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_14_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_14_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $14, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $1, %ecx
> + ja LABEL(nibble_ashr_14_restart_use)
> +
> + jmp LABEL(nibble_ashr_exit_use)
> +
> +/*
> + * The following cases will be handled by ashr_15
> + * rcx (offset of rsi) | rax (offset of rdi) | relative offset       | corresponding case
> + * n (1~15)            | n - 1               | 14 (15 + (n - 1) - n) | ashr_15
> + */
> + .p2align 4
> +LABEL(ashr_15):
> + pslldq $1, %xmm2
> + TOLOWER (%xmm1, %xmm2)
> + pcmpeqb %xmm1, %xmm2
> + psubb %xmm0, %xmm2
> + pmovmskb %xmm2, %r9d
> + shr %cl, %edx
> + shr %cl, %r9d
> + sub %r9d, %edx
> + jnz LABEL(less32bytes)
> +
> + movdqa (%rdi), %xmm3
> +
> + UPDATE_STRNCMP_COUNTER
> +
> + mov $16, %rcx /* index for loads */
> + mov $15, %r9d /* byte position left over from less32bytes case */
> + /*
> + * Set up %r10 so that we can detect crossing a page boundary.
> + * When %r10 goes positive we have crossed a page boundary and
> + * need to do a nibble (branch to LABEL(nibble_ashr_15_use)).
> + */
> + lea 15(%rdi), %r10
> + and $0xfff, %r10 /* offset into 4K page */
> +
> + sub $0x1000, %r10 /* subtract 4K pagesize */
> +
> + mov %rcx, %rdx /* only for offset of sse4 instruction loop*/
> +
> + .p2align 4
> +LABEL(loop_ashr_15_use):
> + add $16, %r10
> + jg LABEL(nibble_ashr_15_use)
> +
> +LABEL(nibble_ashr_15_restart_use):
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $15, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> +
> + add $16, %rdx
> + add $16, %r10
> + jg LABEL(nibble_ashr_15_use)
> +
> + movdqa (%rdi, %rdx), %xmm0
> + palignr $15, -16(%rdi, %rdx), %xmm0
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a, (%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + jbe LABEL(exit_use)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub $16, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add $16, %rdx
> + jmp LABEL(loop_ashr_15_use)
> +
> + .p2align 4
> +LABEL(nibble_ashr_15_use):
> + sub $0x1000, %r10
> + movdqa -16(%rdi, %rdx), %xmm0
> + psrldq $15, %xmm0
> + pcmpistri $0x3a,%xmm0, %xmm0
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + cmp %r11, %rcx
> + jae LABEL(nibble_ashr_exit_use)
> +# endif
> + cmp $0, %ecx
> + ja LABEL(nibble_ashr_15_restart_use)
> +
> +LABEL(nibble_ashr_exit_use):
> +# if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L
> + pcmpistri $0x1a,(%rsi,%rdx), %xmm0
> +# else
> + movdqa (%rsi,%rdx), %xmm1
> + TOLOWER (%xmm0, %xmm1)
> + pcmpistri $0x1a, %xmm1, %xmm0
> +# endif
> + .p2align 4
> +LABEL(exit_use):
> + jnc LABEL(strcmp_exitz)
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub %rcx, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + add %rcx, %rdx
> + lea -16(%rdi, %r9), %rdi
> + movzbl (%rdi, %rdx), %eax
> + movzbl (%rsi, %rdx), %edx
> + test %r8d, %r8d
> + jz LABEL(ret_use)
> + xchg %eax, %edx
> +LABEL(ret_use):
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rcx
> + movl (%rcx,%rdx,4), %edx
> + movl (%rcx,%rax,4), %eax
> +# endif
> +
> + sub %edx, %eax
> + ret
> +
> +LABEL(less32bytes):
> + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */
> + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */
> + test %r8d, %r8d
> + jz LABEL(ret)
> + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */
> +
> + .p2align 4
> +LABEL(ret):
> +LABEL(less16bytes):
> + bsf %rdx, %rdx /* find and store bit index in %rdx */
> +
> +# if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
> + sub %rdx, %r11
> + jbe LABEL(strcmp_exitz)
> +# endif
> + movzbl (%rsi, %rdx), %ecx
> + movzbl (%rdi, %rdx), %eax
> +
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> + movl (%rdx,%rcx,4), %ecx
> + movl (%rdx,%rax,4), %eax
> +# endif
> +
> + sub %ecx, %eax
> + ret
> +
> +LABEL(strcmp_exitz):
> + xor %eax, %eax
> + ret
> +
> + .p2align 4
> + // XXX Same as code above
> +LABEL(Byte0):
> + movzbl (%rsi), %ecx
> + movzbl (%rdi), %eax
> +
> +# if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
> + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
> + movl (%rdx,%rcx,4), %ecx
> + movl (%rdx,%rax,4), %eax
> +# endif
> +
> + sub %ecx, %eax
> + ret
> + cfi_endproc
> + .size STRCMP, .-STRCMP
> +
> +# undef UCLOW_reg
> +# undef UCHIGH_reg
> +# undef LCQWORD_reg
> +# undef TOLOWER
> +
> + /* Put all SSE 4.2 functions together. */
> + .section .rodata.SECTION,"a",@progbits
> + .p2align 3
> +LABEL(unaligned_table):
> + .int LABEL(ashr_1) - LABEL(unaligned_table)
> + .int LABEL(ashr_2) - LABEL(unaligned_table)
> + .int LABEL(ashr_3) - LABEL(unaligned_table)
> + .int LABEL(ashr_4) - LABEL(unaligned_table)
> + .int LABEL(ashr_5) - LABEL(unaligned_table)
> + .int LABEL(ashr_6) - LABEL(unaligned_table)
> + .int LABEL(ashr_7) - LABEL(unaligned_table)
> + .int LABEL(ashr_8) - LABEL(unaligned_table)
> + .int LABEL(ashr_9) - LABEL(unaligned_table)
> + .int LABEL(ashr_10) - LABEL(unaligned_table)
> + .int LABEL(ashr_11) - LABEL(unaligned_table)
> + .int LABEL(ashr_12) - LABEL(unaligned_table)
> + .int LABEL(ashr_13) - LABEL(unaligned_table)
> + .int LABEL(ashr_14) - LABEL(unaligned_table)
> + .int LABEL(ashr_15) - LABEL(unaligned_table)
> + .int LABEL(ashr_0) - LABEL(unaligned_table)
> +
> +# undef LABEL
> +# undef GLABEL
> +# undef SECTION
> +# undef movdqa
> +# undef movdqu
> +# undef pmovmskb
> +# undef pcmpistri
> +# undef psubb
> +# undef pcmpeqb
> +# undef psrldq
> +# undef pslldq
> +# undef palignr
> +# undef pxor
> +# undef D
> #endif
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
> index 08e23548c3..1ce5c4e93f 100644
> --- a/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
> +++ b/sysdeps/x86_64/multiarch/strncase_l-sse4_2.S
> @@ -16,6 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#define STRCMP_SSE42 __strncasecmp_l_sse42
> #define USE_AS_STRNCASECMP_L
> -#include "strcmp-sse42.S"
> +#include "strcmp-sse4_2.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
> index 310a6dbe77..2a02f0c2a6 100644
> --- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
> +++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
> @@ -16,8 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#if IS_IN (libc)
> -# define STRCMP_SSE42 __strncmp_sse42
> -# define USE_AS_STRNCMP
> -# include "strcmp-sse42.S"
> -#endif
> +#define USE_AS_STRNCMP
> +#include "strcmp-sse4_2.S"
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII
2022-07-12 19:28 [PATCH v1 1/4] x86: Rename STRCASECMP_NONASCII macro to STRCASECMP_L_NONASCII Noah Goldstein
` (2 preceding siblings ...)
2022-07-12 19:28 ` [PATCH v1 4/4] x86: Move strcmp SSE42 implementation to multiarch/strcmp-sse4_2.S Noah Goldstein
@ 2022-07-12 23:30 ` H.J. Lu
3 siblings, 0 replies; 8+ messages in thread
From: H.J. Lu @ 2022-07-12 23:30 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:28 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The previous macro name can be confusing given that both
> `__strcasecmp_l_nonascii` and `__strcasecmp_nonascii` are
> functions and we use the `_l` version.
> ---
> sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +++---
> sysdeps/x86_64/multiarch/strcmp-evex.S | 6 +++---
> 2 files changed, 6 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> index 516cde1145..3ab21e3a58 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
> @@ -89,12 +89,12 @@
> # define STRCASECMP __strncasecmp_avx2
> # define LOCALE_REG rcx
> # define LOCALE_REG_LP RCX_LP
> -# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# define STRCASECMP_L_NONASCII __strncasecmp_l_nonascii
> # else
> # define STRCASECMP __strcasecmp_avx2
> # define LOCALE_REG rdx
> # define LOCALE_REG_LP RDX_LP
> -# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# define STRCASECMP_L_NONASCII __strcasecmp_l_nonascii
> # endif
> # endif
>
> @@ -215,7 +215,7 @@ STRCMP:
> mov (%LOCALE_REG), %RAX_LP
> # endif
> testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> - jne STRCASECMP_NONASCII
> + jne STRCASECMP_L_NONASCII
> leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> # endif
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index e97d51bb26..afbf13a230 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -123,12 +123,12 @@
> # define STRCASECMP __strncasecmp_evex
> # define LOCALE_REG rcx
> # define LOCALE_REG_LP RCX_LP
> -# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
> +# define STRCASECMP_L_NONASCII __strncasecmp_l_nonascii
> # else
> # define STRCASECMP __strcasecmp_evex
> # define LOCALE_REG rdx
> # define LOCALE_REG_LP RDX_LP
> -# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
> +# define STRCASECMP_L_NONASCII __strcasecmp_l_nonascii
> # endif
> # endif
>
> @@ -241,7 +241,7 @@ STRCMP:
> mov (%LOCALE_REG), %RAX_LP
> # endif
> testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> - jne STRCASECMP_NONASCII
> + jne STRCASECMP_L_NONASCII
> leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
> # endif
>
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 8+ messages in thread