From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v1 22/23] x86: Add EVEX optimized str{n}casecmp
Date: Wed, 23 Mar 2022 16:57:44 -0500
Message-ID: <20220323215734.3927131-22-goldstein.w.n@gmail.com>
In-Reply-To: <20220323215734.3927131-1-goldstein.w.n@gmail.com>

geometric_mean(N=40) of all benchmarks (EVEX time / SSE42 time): 0.621

All string/memory tests pass.
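
The case conversion itself is done in-register with the usual
branchless range-compare trick (see the lcase_min/lcase_max/case_add
constants and the TOLOWER macro in the diff below). As a scalar
sketch of what each vector lane computes (illustrative only; the
helper name is made up and this is not part of the patch):

    static inline unsigned char
    ascii_tolower (unsigned char c)
    {
      /* 'A'..'Z' map to 0..25 after subtracting 0x41 ('A'); every
         other byte wraps above 0x19 in unsigned arithmetic.  */
      if ((unsigned char) (c - 0x41) < 0x1a)
        c += 0x20;	/* Set bit 5: 'A' -> 'a'.  */
      return c;
    }

The EVEX version does the range compare with vpcmpub into a mask
register and applies the 0x20 addition under that mask
(vpaddb ... {%k5}), so both source vectors are lowercased before the
lane-wise equality test.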
---
Geometric mean of N=40 runs; all functions page aligned
length, align1, align2, max_char, EVEX Time / SSE42 Time
     1,      1,      1,      127,                  0.871
     2,      2,      2,      127,                  0.833
     3,      3,      3,      127,                  0.851
     4,      4,      4,      127,                  0.824
     5,      5,      5,      127,                  0.791
     6,      6,      6,      127,                  0.789
     7,      7,      7,      127,                  0.804
     8,      0,      0,      127,                  0.838
     9,      1,      1,      127,                  0.837
    10,      2,      2,      127,                  0.834
    11,      3,      3,      127,                  0.839
    12,      4,      4,      127,                  0.844
    13,      5,      5,      127,                  0.796
    14,      6,      6,      127,                  0.811
    15,      7,      7,      127,                  0.838
     4,      0,      0,      127,                  0.840
     4,      0,      0,      254,                  0.823
     8,      0,      0,      254,                  0.838
    16,      0,      0,      127,                  0.669
    16,      0,      0,      254,                  0.656
    32,      0,      0,      127,                  0.488
    32,      0,      0,      254,                  0.484
    64,      0,      0,      127,                  0.492
    64,      0,      0,      254,                  0.502
   128,      0,      0,      127,                  0.508
   128,      0,      0,      254,                  0.497
   256,      0,      0,      127,                  0.574
   256,      0,      0,      254,                  0.581
   512,      0,      0,      127,                  0.573
   512,      0,      0,      254,                  0.577
  1024,      0,      0,      127,                  0.489
  1024,      0,      0,      254,                  0.485
    16,      1,      2,      127,                  0.655
    16,      2,      1,      254,                  0.646
    32,      2,      4,      127,                  0.368
    32,      4,      2,      254,                  0.376
    64,      3,      6,      127,                  0.428
    64,      6,      3,      254,                  0.426
   128,      4,      0,      127,                  0.478
   128,      0,      4,      254,                  0.473
   256,      5,      2,      127,                  0.650
   256,      2,      5,      254,                  0.654
   512,      6,      4,      127,                  0.492
   512,      4,      6,      254,                  0.489
  1024,      7,      6,      127,                  0.463
  1024,      6,      7,      254,                  0.457
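
The summary number above is the geometric mean of the
per-configuration time ratios. A minimal sketch of how such a summary
can be computed (the ratios[] array is a stand-in for the measured
EVEX/SSE42 ratios, not data from this run):

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of n positive ratios, computed in log space so
       the running product cannot overflow or underflow.  */
    static double
    geometric_mean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / (double) n);
    }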

 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  16 ++
 sysdeps/x86_64/multiarch/ifunc-strcasecmp.h  |   5 +
 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S |  23 ++
 sysdeps/x86_64/multiarch/strcmp-evex.S       | 280 ++++++++++++++++---
 sysdeps/x86_64/multiarch/strncase_l-evex.S   |  25 ++
 6 files changed, 314 insertions(+), 37 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
 create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 06e1848823..35d80dc2ff 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -57,6 +57,7 @@ sysdep_routines += \
   strcasecmp_l-avx \
   strcasecmp_l-avx2 \
   strcasecmp_l-avx2-rtm \
+  strcasecmp_l-evex \
   strcasecmp_l-sse2 \
   strcasecmp_l-sse4_2 \
   strcasecmp_l-ssse3 \
@@ -97,6 +98,7 @@ sysdep_routines += \
   strncase_l-avx \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
+  strncase_l-evex \
   strncase_l-sse2 \
   strncase_l-sse4_2 \
   strncase_l-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 3c556d07ac..f1a4d3dac2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -436,6 +436,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_avx2)
@@ -456,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strcasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strcasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strcasecmp_l_avx2)
@@ -590,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_avx2)
@@ -611,6 +623,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/x86_64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
+	      IFUNC_IMPL_ADD (array, i, strncasecmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __strncasecmp_l_evex)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __strncasecmp_l_avx2)
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index c4de111fd0..bf0d146e7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 
 static inline void *
 IFUNC_SELECTOR (void)
@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+        return OPTIMIZE (evex);
+
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
         return OPTIMIZE (avx2_rtm);
 
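Note that the new EVEX branch sits inside the existing
AVX2 + AVX_Fast_Unaligned_Load check, so the resulting dispatch
priority is EVEX (AVX512VL and AVX512BW usable) first, then AVX2-RTM,
then plain AVX2, with the pre-AVX2 implementations unchanged below it.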
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
new file mode 100644
index 0000000000..58642db748
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S
@@ -0,0 +1,23 @@
+/* strcasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strcasecmp_l_evex
+#endif
+#define USE_AS_STRCASECMP_L
+#include "strcmp-evex.S"
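As with the existing AVX2 variants, the new files are thin wrappers:
they pick the entry-point name via STRCMP and then include
strcmp-evex.S, which USE_AS_STRCASECMP_L (and, for the n-bounded
variant at the end of this patch, USE_AS_STRNCMP plus OVERFLOW_STRCMP)
parameterizes into the case-insensitive implementations.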
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 56d8c118e4..85afd6535f 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -19,6 +19,9 @@
 #if IS_IN (libc)
 
 # include <sysdep.h>
+# if defined USE_AS_STRCASECMP_L
+#  include "locale-defines.h"
+# endif
 
 # ifndef STRCMP
 #  define STRCMP	__strcmp_evex
@@ -34,19 +37,29 @@
 # define VMOVA	vmovdqa64
 
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl	$0xff,
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__wcscmp_evex
+#  endif
+
+#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
+#  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
 # else
+#  ifndef OVERFLOW_STRCMP
+#   define OVERFLOW_STRCMP	__strcmp_evex
+#  endif
+
 #  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
+#  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
 # endif
@@ -73,11 +86,16 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMMZERO	xmm16
 # define XMM0	xmm17
 # define XMM1	xmm18
 
-# define YMMZERO	ymm16
+# define XMM10	xmm27
+# define XMM11	xmm28
+# define XMM12	xmm29
+# define XMM13	xmm30
+# define XMM14	xmm31
+
+
 # define YMM0	ymm17
 # define YMM1	ymm18
 # define YMM2	ymm19
@@ -89,6 +107,87 @@
 # define YMM8	ymm25
 # define YMM9	ymm26
 # define YMM10	ymm27
+# define YMM11	ymm28
+# define YMM12	ymm29
+# define YMM13	ymm30
+# define YMM14	ymm31
+
+# ifdef USE_AS_STRCASECMP_L
+#  define BYTE_LOOP_REG	OFFSET_REG
+# else
+#  define BYTE_LOOP_REG	ecx
+# endif
+
+# ifdef USE_AS_STRCASECMP_L
+#  ifdef USE_AS_STRNCMP
+#   define STRCASECMP	__strncasecmp_evex
+#   define LOCALE_REG	rcx
+#   define LOCALE_REG_LP	RCX_LP
+#   define STRCASECMP_NONASCII	__strncasecmp_l_nonascii
+#  else
+#   define STRCASECMP	__strcasecmp_evex
+#   define LOCALE_REG	rdx
+#   define LOCALE_REG_LP	RDX_LP
+#   define STRCASECMP_NONASCII	__strcasecmp_l_nonascii
+#  endif
+# endif
+
+# define LCASE_MIN_YMM	%YMM12
+# define LCASE_MAX_YMM	%YMM13
+# define CASE_ADD_YMM	%YMM14
+
+# define LCASE_MIN_XMM	%XMM12
+# define LCASE_MAX_XMM	%XMM13
+# define CASE_ADD_XMM	%XMM14
+
+	/* NB: wcsncmp uses r11 but strcasecmp is never used in
+	   conjunction with wcscmp.  */
+# define TOLOWER_BASE	%r11
+
+# ifdef USE_AS_STRCASECMP_L
+#  define _REG(x, y) x ## y
+#  define REG(x, y) _REG(x, y)
+#  define TOLOWER(reg1, reg2, ext)										\
+	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
+	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
+	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
+	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
+	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
+	TOLOWER	(s1_reg, s2_reg, ext);										\
+	VPCMP	$0, s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
+	VMOVU	s2_mem, s2_reg;												\
+	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
+
+#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
+
+#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+
+# else
+#  define TOLOWER_gpr(...)
+#  define TOLOWER_YMM(...)
+#  define TOLOWER_XMM(...)
+
+#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
+	VPCMP	$0, s2_reg, s1_reg, reg_out
+
+#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+
+#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
+	VPCMP	$0, s2_mem, s1_reg, reg_out
+
+#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+# endif
 
 /* Warning!
            wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -112,7 +211,41 @@
    returned.  */
 
 	.section .text.evex, "ax", @progbits
-ENTRY(STRCMP)
+	.align	16
+	.type	STRCMP, @function
+	.globl	STRCMP
+	.hidden	STRCMP
+
+# ifdef USE_AS_STRCASECMP_L
+ENTRY (STRCASECMP)
+	movq	__libc_tsd_LOCALE@gottpoff(%rip), %rax
+	mov	%fs:(%rax), %LOCALE_REG_LP
+
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
+END (STRCASECMP)
+	/* FALLTHROUGH to strcasecmp/strncasecmp_l.  */
+# endif
+
+	.p2align 4
+STRCMP:
+	cfi_startproc
+	_CET_ENDBR
+	CALL_MCOUNT
+
+# if defined USE_AS_STRCASECMP_L
+	/* We have to fall back on the C implementation for locales with
+	   encodings not matching ASCII for single bytes.  */
+#  if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0
+	mov	LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP
+#  else
+	mov	(%LOCALE_REG), %RAX_LP
+#  endif
+	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	jne	STRCASECMP_NONASCII
+	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
+# endif
+
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
@@ -125,6 +258,32 @@ ENTRY(STRCMP)
 	   actually bound the buffer.  */
 	jle	L(one_or_less)
 # endif
+
+# if defined USE_AS_STRCASECMP_L
+	.section .rodata.cst32, "aM", @progbits, 32
+	.align	32
+L(lcase_min):
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+L(lcase_max):
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+L(case_add):
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.previous
+
+	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
+	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
+	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+# endif
+
 	movl	%edi, %eax
 	orl	%esi, %eax
	/* Shift out the bits irrelevant to page boundary ([63:12]).  */
@@ -139,7 +298,7 @@ L(no_page_cross):
 	VPTESTM	%YMM0, %YMM0, %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
@@ -169,6 +328,8 @@ L(return_vec_0):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret0):
@@ -192,7 +353,7 @@ L(one_or_less):
 #  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__wcscmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -203,9 +364,11 @@ L(one_or_less):
 #  else
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
-	jnbe	__strcmp_evex
+	jnbe	OVERFLOW_STRCMP
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret1):
@@ -233,6 +396,8 @@ L(return_vec_1):
 # else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret2):
@@ -270,6 +435,8 @@ L(return_vec_2):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 # endif
 L(ret3):
@@ -290,6 +457,8 @@ L(return_vec_3):
 #  else
 	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 #  endif
 L(ret4):
@@ -303,7 +472,7 @@ L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
 	VMOVU	(VEC_SIZE)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1)
@@ -315,14 +484,14 @@ L(more_3x_vec):
 
 	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_2)
 
 	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_3)
@@ -381,7 +550,6 @@ L(prepare_loop_aligned):
 	subl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 
-	vpxorq	%YMMZERO, %YMMZERO, %YMMZERO
 
 	/* Loop 4x comparisons at a time.  */
 	.p2align 4
@@ -413,22 +581,35 @@ L(loop_skip_page_cross_check):
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
 	VPMINU	%YMM8, %YMM9, %YMM9
 
-	/* Each bit set in K1 represents a non-null CHAR in YMM8.  */
+	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
 	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
-
+# else
+	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
+	TOLOWER_YMM (%YMM0, %YMM1)
+	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
+	TOLOWER_YMM (%YMM2, %YMM3)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM0, %YMM1, %YMM1
+	vpxorq	%YMM2, %YMM3, %YMM3
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+# endif
 	/* Or together YMM3, YMM5, and YMM6.  */
 	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
@@ -437,13 +618,13 @@ L(loop_skip_page_cross_check):
 
 	/* Find which VEC has the mismatch of end of string.  */
 	VPTESTM	%YMM0, %YMM0, %k1
-	VPCMP	$0, %YMMZERO, %YMM1, %k0{%k1}
+	VPTESTNM %YMM1, %YMM1, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
 
 	VPTESTM	%YMM2, %YMM2, %k1
-	VPCMP	$0, %YMMZERO, %YMM3, %k0{%k1}
+	VPTESTNM %YMM3, %YMM3, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -457,7 +638,7 @@ L(return_vec_2_3_end):
 # endif
 
 	VPTESTM	%YMM4, %YMM4, %k1
-	VPCMP	$0, %YMMZERO, %YMM5, %k0{%k1}
+	VPTESTNM %YMM5, %YMM5, %k0{%k1}
 	kmovd	%k0, %ecx
 	TESTEQ	%ecx
 # if CHAR_PER_VEC <= 16
@@ -493,6 +674,8 @@ L(return_vec_3_end):
 # else
 	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
 	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -545,6 +728,8 @@ L(return_vec_0_end):
 # else
 	movzbl	(%rdi, %rcx), %eax
 	movzbl	(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
 	   logic. Subtract `r8d` after xor for zero case.  */
@@ -569,6 +754,8 @@ L(return_vec_1_end):
 #  else
 	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 	movzbl	VEC_SIZE(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -598,7 +785,7 @@ L(page_cross_during_loop):
 
 	VMOVA	(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_0_end)
@@ -619,8 +806,7 @@ L(less_1x_vec_till_page_cross):
 	   been loaded earlier so must be valid.  */
 	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2}
-
+	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -642,6 +828,8 @@ L(less_1x_vec_till_page_cross):
 
 # ifdef USE_AS_STRNCMP
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -679,6 +867,8 @@ L(return_page_cross_cmp_mem):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -709,7 +899,7 @@ L(more_2x_vec_till_page_cross):
 
 	VMOVA	VEC_SIZE(%rdi), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_1_end)
@@ -724,14 +914,14 @@ L(more_2x_vec_till_page_cross):
 	/* Safe to include comparisons from lower bytes.  */
 	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_0)
 
 	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(return_vec_page_cross_1)
@@ -740,6 +930,8 @@ L(more_2x_vec_till_page_cross):
	/* Must check length here as length might preclude reading next
 	   page.  */
 #  ifdef USE_AS_WCSCMP
+	/* NB: strcasecmp not used with WCSCMP so this access to r11 is
+	   safe.  */
 	movl	%eax, %r11d
 	shrl	$2, %r11d
 	cmpq	%r11, %rdx
@@ -754,12 +946,19 @@ L(more_2x_vec_till_page_cross):
 	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
 	VPMINU	%YMM4, %YMM6, %YMM9
 	VPTESTM	%YMM9, %YMM9, %k1
-
+# ifndef USE_AS_STRCASECMP_L
 	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
 	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
-
-	VPCMP	$0, %YMMZERO, %YMM6, %k0{%k1}
+# else
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
+	TOLOWER_YMM (%YMM4, %YMM5)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
+	TOLOWER_YMM (%YMM6, %YMM7)
+	vpxorq	%YMM4, %YMM5, %YMM5
+	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
+# endif
+	VPTESTNM %YMM6, %YMM6, %k0{%k1}
 	kmovd	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
@@ -815,6 +1014,8 @@ L(return_vec_page_cross_1):
 # else
 	movzbl	VEC_OFFSET(%rdi, %rcx), %eax
 	movzbl	VEC_OFFSET(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -871,7 +1072,7 @@ L(page_cross):
 L(page_cross_loop):
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 	kmovd	%k1, %ecx
 	TESTEQ	%ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -895,7 +1096,7 @@ L(page_cross_loop):
 	 */
 	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
 	VPTESTM	%YMM0, %YMM0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2}
+	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
 
 	kmovd	%k1, %ecx
 # ifdef USE_AS_STRNCMP
@@ -930,6 +1131,8 @@ L(ret_vec_page_cross_cont):
 # else
 	movzbl	(%rdi, %rcx, SIZE_OF_CHAR), %eax
 	movzbl	(%rsi, %rcx, SIZE_OF_CHAR), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
 	xorl	%r8d, %eax
 	subl	%r8d, %eax
@@ -989,7 +1192,7 @@ L(less_1x_vec_till_page):
 	/* Use 16 byte comparison.  */
 	vmovdqu	(%rdi), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1009,7 +1212,7 @@ L(less_1x_vec_till_page):
 # endif
 	vmovdqu	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2}
+	CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0xf, %ecx
@@ -1048,7 +1251,7 @@ L(less_16_till_page):
 	vmovq	(%rdi), %xmm0
 	vmovq	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1068,7 +1271,7 @@ L(less_16_till_page):
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovq	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 # ifdef USE_AS_WCSCMP
 	subl	$0x3, %ecx
@@ -1128,7 +1331,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi), %xmm0
 	vmovd	(%rsi), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1143,7 +1346,7 @@ L(ret_less_8_wcs):
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
 	vmovd	(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1
 	VPTESTM	%xmm0, %xmm0, %k2
-	VPCMP	$0, %xmm1, %xmm0, %k1{%k2}
+	CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2}
 	kmovd	%k1, %ecx
 	subl	$0xf, %ecx
 	jnz	L(check_ret_vec_page_cross)
@@ -1176,7 +1379,9 @@ L(less_4_till_page):
 L(less_4_loop):
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi, %rdi), %ecx
-	subl	%ecx, %eax
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %BYTE_LOOP_REG)
+	subl	%BYTE_LOOP_REG, %eax
 	jnz	L(ret_less_4_loop)
 	testl	%ecx, %ecx
 	jz	L(ret_zero_4_loop)
@@ -1203,5 +1408,6 @@ L(ret_less_4_loop):
 	subl	%r8d, %eax
 	ret
 # endif
-END(STRCMP)
+	cfi_endproc
+	.size	STRCMP, .-STRCMP
 #endif
diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S
new file mode 100644
index 0000000000..b0808c1b21
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S
@@ -0,0 +1,25 @@
+/* strncasecmp_l optimized with EVEX.
+   Copyright (C) 2017-2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef STRCMP
+# define STRCMP	__strncasecmp_l_evex
+#endif
+#define OVERFLOW_STRCMP	__strcasecmp_evex
+#define USE_AS_STRCASECMP_L
+#define USE_AS_STRNCMP
+#include "strcmp-evex.S"
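
A quick smoke test of the interfaces the new ifunc routes follows; it
is illustrative only and not part of the patch. The glibc
string/memory test suite remains the authoritative check:

    #include <stdio.h>
    #include <strings.h>	/* strcasecmp, strncasecmp.  */

    int
    main (void)
    {
      /* Case-insensitive equality, the n-bounded variant, and the
         ordering of a genuine mismatch.  */
      printf ("%d\n", strcasecmp ("Hello, World", "hELLO, wORLD") == 0);
      printf ("%d\n", strncasecmp ("HelloXYZ", "hELLOabc", 5) == 0);
      printf ("%d\n", strcasecmp ("abc", "abd") < 0);
      return 0;
    }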
-- 
2.25.1



Thread overview: 76+ messages
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-24 18:44   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
2022-03-24 18:53   ` H.J. Lu
2022-03-24 19:20     ` Noah Goldstein
2022-03-24 19:36       ` H.J. Lu
2022-05-12 19:31         ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-05-12 19:32     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
2022-03-24 18:55   ` H.J. Lu
2022-05-12 19:34     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
2022-03-24 18:56   ` H.J. Lu
2022-05-12 19:39     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:40     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:41     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:42     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59   ` H.J. Lu
2022-03-24 19:18     ` Noah Goldstein
2022-03-24 19:34       ` H.J. Lu
2022-03-24 19:39         ` Noah Goldstein
2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
2022-03-24 21:26     ` H.J. Lu
2022-03-24 21:43       ` Noah Goldstein
2022-03-24 21:58         ` H.J. Lu
2022-05-04  6:05           ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-05-12 19:44     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-05-12 19:45     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03   ` H.J. Lu
2022-03-24 22:41   ` [PATCH v3 " Noah Goldstein
2022-03-24 22:41   ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:15       ` H.J. Lu
2022-03-25 18:18         ` Noah Goldstein
2022-05-12 19:47           ` Sunil Pandey
2022-05-12 19:52             ` Sunil Pandey
2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
2022-05-12 19:52       ` Sunil Pandey
2022-03-23 21:57 ` Noah Goldstein [this message]
2022-03-24 19:04   ` [PATCH v1 22/23] x86: Add EVEX " H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 19:04   ` H.J. Lu
2022-05-12 19:54     ` Sunil Pandey
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu
