From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp
Date: Wed, 23 Mar 2022 16:57:42 -0500 [thread overview]
Message-ID: <20220323215734.3927131-21-goldstein.w.n@gmail.com> (raw)
In-Reply-To: <20220323215734.3927131-1-goldstein.w.n@gmail.com>
geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702
All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, AVX2 Time / SSE42 Time
1, 1, 1, 127, 1.032
2, 2, 2, 127, 1.006
3, 3, 3, 127, 1.009
4, 4, 4, 127, 0.964
5, 5, 5, 127, 0.929
6, 6, 6, 127, 0.94
7, 7, 7, 127, 0.958
8, 0, 0, 127, 0.988
9, 1, 1, 127, 0.99
10, 2, 2, 127, 0.995
11, 3, 3, 127, 0.991
12, 4, 4, 127, 0.975
13, 5, 5, 127, 0.943
14, 6, 6, 127, 0.955
15, 7, 7, 127, 0.988
4, 0, 0, 127, 0.983
4, 0, 0, 254, 0.978
8, 0, 0, 254, 0.989
16, 0, 0, 127, 0.792
16, 0, 0, 254, 0.774
32, 0, 0, 127, 0.568
32, 0, 0, 254, 0.555
64, 0, 0, 127, 0.561
64, 0, 0, 254, 0.561
128, 0, 0, 127, 0.574
128, 0, 0, 254, 0.577
256, 0, 0, 127, 0.561
256, 0, 0, 254, 0.552
512, 0, 0, 127, 0.59
512, 0, 0, 254, 0.594
1024, 0, 0, 127, 0.528
1024, 0, 0, 254, 0.517
16, 1, 2, 127, 0.758
16, 2, 1, 254, 0.748
32, 2, 4, 127, 0.419
32, 4, 2, 254, 0.428
64, 3, 6, 127, 0.472
64, 6, 3, 254, 0.464
128, 4, 0, 127, 0.534
128, 0, 4, 254, 0.53
256, 5, 2, 127, 0.679
256, 2, 5, 254, 0.676
512, 6, 4, 127, 0.525
512, 4, 6, 254, 0.523
1024, 7, 6, 127, 0.518
1024, 6, 7, 254, 0.505
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 +
.../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++
sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 230 +++++++++++++++---
.../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++
sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++
8 files changed, 324 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7b413edad..06e1848823 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -55,6 +55,8 @@ sysdep_routines += \
stpncpy-sse2-unaligned \
stpncpy-ssse3 \
strcasecmp_l-avx \
+ strcasecmp_l-avx2 \
+ strcasecmp_l-avx2-rtm \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
strcasecmp_l-ssse3 \
@@ -93,6 +95,8 @@ sysdep_routines += \
strlen-evex \
strlen-sse2 \
strncase_l-avx \
+ strncase_l-avx2 \
+ strncase_l-avx2-rtm \
strncase_l-sse2 \
strncase_l-sse4_2 \
strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a594f4176e..3c556d07ac 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp,
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strcasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_avx)
@@ -449,6 +456,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
IFUNC_IMPL (i, name, strcasecmp_l,
+ /* Use strcasecmp_l, matching the enclosing list and its AVX entry. */
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ CPU_FEATURE_USABLE (AVX2), __strcasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strcasecmp_l,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strcasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strcasecmp_l_avx)
@@ -576,6 +590,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp,
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ CPU_FEATURE_USABLE (AVX2),
+ __strncasecmp_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_avx)
@@ -590,6 +611,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncase_l.c. */
IFUNC_IMPL (i, name, strncasecmp_l,
+ /* Use strncasecmp_l, matching the enclosing list and its AVX entry. */
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ CPU_FEATURE_USABLE (AVX2), __strncasecmp_l_avx2)
+ IFUNC_IMPL_ADD (array, i, strncasecmp_l,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __strncasecmp_l_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (AVX),
__strncasecmp_l_avx)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 9e3cc61ac0..c4de111fd0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
if (CPU_FEATURE_USABLE_P (cpu_features, AVX))
return OPTIMIZE (avx);
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
new file mode 100644
index 0000000000..09957fc3c5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S
@@ -0,0 +1,15 @@
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+
+#include "strcasecmp_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
new file mode 100644
index 0000000000..e2762f2a22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strcasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 86a86b68e3..eeb90a0da6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -20,6 +20,10 @@
# include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+# include "locale-defines.h"
+# endif
+
# ifndef STRCMP
# define STRCMP __strcmp_avx2
# endif
@@ -74,13 +78,88 @@
# define VEC_OFFSET (-VEC_SIZE)
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define BYTE_LOOP_REG OFFSET_REG
+# else
+# define BYTE_LOOP_REG ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+# ifdef USE_AS_STRNCMP
+# define STRCASECMP __strncasecmp_avx2
+# define LOCALE_REG rcx
+# define LOCALE_REG_LP RCX_LP
+# define STRCASECMP_NONASCII __strncasecmp_l_nonascii
+# else
+# define STRCASECMP __strcasecmp_avx2
+# define LOCALE_REG rdx
+# define LOCALE_REG_LP RDX_LP
+# define STRCASECMP_NONASCII __strcasecmp_l_nonascii
+# endif
+# endif
+
# define xmmZERO xmm15
# define ymmZERO ymm15
+# define LCASE_MIN_ymm %ymm10
+# define LCASE_MAX_ymm %ymm11
+# define CASE_ADD_ymm %ymm12
+
+# define LCASE_MIN_xmm %xmm10
+# define LCASE_MAX_xmm %xmm11
+# define CASE_ADD_xmm %xmm12
+
+ /* r11 is never used elsewhere so this is safe to maintain. */
+# define TOLOWER_BASE %r11
+
# ifndef SECTION
# define SECTION(p) p##.avx
# endif
+# ifdef USE_AS_STRCASECMP_L
+# define REG(x, y) x ## y
+# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \
+ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \
+ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \
+ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \
+ vpaddb REG(%ext, 8), reg1_in, reg1_out; \
+ vpaddb REG(%ext, 9), reg2_in, reg2_out
+
+# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm)
+# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm)
+
+# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \
+ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \
+ VPCMPEQ scratch_reg, s2_reg, reg_out
+
+# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \
+ VMOVU s2_mem, reg_out; \
+ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext)
+
+# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm)
+# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm)
+# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm)
+
+# else
+# define TOLOWER_gpr(...)
+# define TOLOWER_ymm(...)
+# define TOLOWER_xmm(...)
+
+# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \
+ VPCMPEQ s2_reg, s1_reg, reg_out
+
+# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+
+# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__)
+# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__)
+# endif
+
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
strcmp/strncmp have to use UNSIGNED comparison for elements.
@@ -102,7 +181,45 @@
returned. */
.section SECTION(.text), "ax", @progbits
-ENTRY(STRCMP)
+ .align 16
+ .type STRCMP, @function
+ .globl STRCMP
+ .hidden STRCMP
+
+# ifndef GLABEL
+# define GLABEL(...) __VA_ARGS__
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (GLABEL(STRCASECMP))
+ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax
+ mov %fs:(%rax), %LOCALE_REG_LP
+
+ /* Either 1 or 5 bytes (depending on whether CET is enabled). */
+ .p2align 4
+END (GLABEL(STRCASECMP))
+ /* FALLTHROUGH to strcasecmp_l/strncasecmp_l. */
+# endif
+
+ .p2align 4
+STRCMP:
+ cfi_startproc
+ _CET_ENDBR
+ CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+ /* We have to fall back on the C implementation for locales with
+ encodings not matching ASCII for single bytes. */
+# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+# else
+ mov (%LOCALE_REG), %RAX_LP
+# endif
+ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+ jne STRCASECMP_NONASCII
+ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -128,6 +245,30 @@ ENTRY(STRCMP)
# endif
# endif
vpxor %xmmZERO, %xmmZERO, %xmmZERO
+# if defined USE_AS_STRCASECMP_L
+ .section .rodata.cst32, "aM", @progbits, 32
+ .align 32
+L(lcase_min):
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+ .quad 0x3f3f3f3f3f3f3f3f
+L(lcase_max):
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+ .quad 0x9999999999999999
+L(case_add):
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .quad 0x2020202020202020
+ .previous
+
+ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm
+ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm
+ vmovdqa L(case_add)(%rip), CASE_ADD_ymm
+# endif
movl %edi, %eax
orl %esi, %eax
sall $20, %eax
@@ -138,8 +279,10 @@ ENTRY(STRCMP)
L(no_page_cross):
/* Safe to compare 4x vectors. */
VMOVU (%rdi), %ymm0
- /* 1s where s1 and s2 equal. */
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ /* 1s where s1 and s2 equal. Just VPCMPEQ if it's not strcasecmp.
+ Otherwise converts ymm0 and the load from rsi to lowercase first.
+ ymm2 is scratch and ymm1 is the return. */
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
/* 1s at null CHAR. */
VPCMPEQ %ymm0, %ymmZERO, %ymm2
/* 1s where s1 and s2 equal AND not null CHAR. */
@@ -172,6 +315,8 @@ L(return_vec_0):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret0):
@@ -207,6 +352,8 @@ L(one_or_less):
# else
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret1):
@@ -234,6 +381,8 @@ L(return_vec_1):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret2):
@@ -265,6 +414,8 @@ L(return_vec_2):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret3):
@@ -285,6 +436,8 @@ L(return_vec_3):
# else
movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
# endif
L(ret4):
@@ -295,7 +448,7 @@ L(ret4):
L(more_3x_vec):
/* Safe to compare 4x vectors. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -308,7 +461,7 @@ L(more_3x_vec):
# endif
VMOVU (VEC_SIZE * 2)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -316,7 +469,7 @@ L(more_3x_vec):
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rdi), %ymm0
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -391,12 +544,10 @@ L(loop_skip_page_cross_check):
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
/* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */
- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1
-
- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
-
+ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1)
+ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3)
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
/* If any mismatches or null CHAR then 0 CHAR, otherwise non-
zero. */
@@ -465,6 +616,8 @@ L(return_vec_2_3_end):
# else
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax
movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -508,6 +661,8 @@ L(return_vec_0_end):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -530,6 +685,8 @@ L(return_vec_1_end):
# else
movzbl VEC_SIZE(%rdi, %rcx), %eax
movzbl VEC_SIZE(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -556,6 +713,8 @@ L(return_vec_2_end):
# else
movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -583,7 +742,7 @@ L(page_cross_during_loop):
jle L(less_1x_vec_till_page_cross)
VMOVA (%rdi), %ymm0
- VPCMPEQ (%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -605,7 +764,7 @@ L(less_1x_vec_till_page_cross):
here, it means the previous page (rdi - VEC_SIZE) has already
been loaded earlier so must be valid. */
VMOVU -VEC_SIZE(%rdi, %rax), %ymm0
- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -647,6 +806,8 @@ L(return_page_cross_cmp_mem):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -673,7 +834,7 @@ L(more_2x_vec_till_page_cross):
iteration here. */
VMOVU VEC_SIZE(%rdi), %ymm0
- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -689,7 +850,7 @@ L(more_2x_vec_till_page_cross):
/* Safe to include comparisons from lower bytes. */
VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -697,7 +858,7 @@ L(more_2x_vec_till_page_cross):
jnz L(return_vec_page_cross_0)
VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0
- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -715,8 +876,8 @@ L(more_2x_vec_till_page_cross):
VMOVA (VEC_SIZE * 2)(%rdi), %ymm4
VMOVA (VEC_SIZE * 3)(%rdi), %ymm6
- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5
- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7
+ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5)
+ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7)
vpand %ymm4, %ymm5, %ymm5
vpand %ymm6, %ymm7, %ymm7
VPMINU %ymm5, %ymm7, %ymm7
@@ -767,6 +928,8 @@ L(return_vec_page_cross_1):
# else
movzbl VEC_OFFSET(%rdi, %rcx), %eax
movzbl VEC_OFFSET(%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -822,7 +985,7 @@ L(page_cross):
L(page_cross_loop):
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -840,11 +1003,11 @@ L(page_cross_loop):
subl %eax, %OFFSET_REG
/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
to not cross page so is safe to load. Since we have already
- loaded at least 1 VEC from rsi it is also guranteed to be safe.
- */
+ loaded at least 1 VEC from rsi it is also guranteed to be
+ safe. */
VMOVU (%rdi, %OFFSET_REG64), %ymm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1
+ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1)
VPCMPEQ %ymm0, %ymmZERO, %ymm2
vpandn %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %ecx
@@ -877,6 +1040,8 @@ L(ret_vec_page_cross_cont):
# else
movzbl (%rdi, %rcx), %eax
movzbl (%rsi, %rcx), %ecx
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %ecx)
subl %ecx, %eax
xorl %r8d, %eax
subl %r8d, %eax
@@ -930,7 +1095,7 @@ L(less_1x_vec_till_page):
ja L(less_16_till_page)
VMOVU (%rdi), %xmm0
- VPCMPEQ (%rsi), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -948,7 +1113,7 @@ L(less_1x_vec_till_page):
# endif
VMOVU (%rdi, %OFFSET_REG64), %xmm0
- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1
+ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1)
VPCMPEQ %xmm0, %xmmZERO, %xmm2
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
@@ -986,7 +1151,7 @@ L(less_16_till_page):
vmovq (%rdi), %xmm0
vmovq (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1006,7 +1171,7 @@ L(less_16_till_page):
vmovq (%rdi, %OFFSET_REG64), %xmm0
vmovq (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
incb %cl
@@ -1062,7 +1227,7 @@ L(ret_less_8_wcs):
vmovd (%rdi), %xmm0
vmovd (%rsi), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1081,7 +1246,7 @@ L(ret_less_8_wcs):
vmovd (%rdi, %OFFSET_REG64), %xmm0
vmovd (%rsi, %OFFSET_REG64), %xmm1
VPCMPEQ %xmm0, %xmmZERO, %xmm2
- VPCMPEQ %xmm1, %xmm0, %xmm1
+ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1)
vpandn %xmm1, %xmm2, %xmm1
vpmovmskb %ymm1, %ecx
subl $0xf, %ecx
@@ -1115,7 +1280,9 @@ L(less_4_till_page):
L(less_4_loop):
movzbl (%rdi), %eax
movzbl (%rsi, %rdi), %ecx
- subl %ecx, %eax
+ TOLOWER_gpr (%rax, %eax)
+ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+ subl %BYTE_LOOP_REG, %eax
jnz L(ret_less_4_loop)
testl %ecx, %ecx
jz L(ret_zero_4_loop)
@@ -1142,5 +1309,6 @@ L(ret_less_4_loop):
subl %r8d, %eax
ret
# endif
-END(STRCMP)
+ cfi_endproc
+ .size STRCMP, .-STRCMP
#endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
new file mode 100644
index 0000000000..e194936c36
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S
@@ -0,0 +1,16 @@
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2_rtm
+#endif
+
+#define _GLABEL(x) x ## _rtm
+#define GLABEL(x) _GLABEL(x)
+
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
+
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
+
+#define SECTION(p) p##.avx.rtm
+#define OVERFLOW_STRCMP __strcasecmp_avx2_rtm
+
+#include "strncase_l-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
new file mode 100644
index 0000000000..29afccbcc5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S
@@ -0,0 +1,27 @@
+/* strncasecmp_l optimized with AVX2.
+ Copyright (C) 2017-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef STRCMP
+# define STRCMP __strncasecmp_l_avx2
+#endif
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#ifndef OVERFLOW_STRCMP
+# define OVERFLOW_STRCMP __strcasecmp_avx2
+#endif
+#include "strcmp-avx2.S"
--
2.25.1
next prev parent reply other threads:[~2022-03-23 22:02 UTC|newest]
Thread overview: 76+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-24 18:44 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
2022-03-24 18:53 ` H.J. Lu
2022-03-24 19:20 ` Noah Goldstein
2022-03-24 19:36 ` H.J. Lu
2022-05-12 19:31 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-05-12 19:32 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
2022-03-24 18:54 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
2022-03-24 18:55 ` H.J. Lu
2022-05-12 19:34 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
2022-03-24 18:56 ` H.J. Lu
2022-05-12 19:39 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:40 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:41 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
2022-03-24 18:57 ` H.J. Lu
2022-05-12 19:42 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59 ` H.J. Lu
2022-03-24 19:18 ` Noah Goldstein
2022-03-24 19:34 ` H.J. Lu
2022-03-24 19:39 ` Noah Goldstein
2022-03-24 20:50 ` [PATCH v2 12/31] " Noah Goldstein
2022-03-24 21:26 ` H.J. Lu
2022-03-24 21:43 ` Noah Goldstein
2022-03-24 21:58 ` H.J. Lu
2022-05-04 6:05 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
2022-03-24 19:00 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
2022-03-24 19:01 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:44 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-05-12 19:45 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
2022-03-24 19:02 ` H.J. Lu
2022-03-23 21:57 ` Noah Goldstein [this message]
2022-03-24 19:03 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp H.J. Lu
2022-03-24 22:41 ` [PATCH v3 " Noah Goldstein
2022-03-24 22:41 ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56 ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:15 ` H.J. Lu
2022-03-25 18:18 ` Noah Goldstein
2022-05-12 19:47 ` Sunil Pandey
2022-05-12 19:52 ` Sunil Pandey
2022-03-25 18:14 ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
2022-05-12 19:52 ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 19:04 ` H.J. Lu
2022-05-12 19:54 ` Sunil Pandey
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220323215734.3927131-21-goldstein.w.n@gmail.com \
--to=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).