public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Subject: [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
Date: Wed, 23 Mar 2022 16:57:38 -0500	[thread overview]
Message-ID: <20220323215734.3927131-18-goldstein.w.n@gmail.com> (raw)
In-Reply-To: <20220323215734.3927131-1-goldstein.w.n@gmail.com>

Slightly faster method of doing TOLOWER that saves an
instruction.

Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with
CET enabled, the hard-coded NOP misaligned the entry to strcasecmp.

geometric_mean(N=40) of all benchmarks New / Original: .920

All string/memory tests pass.
---
Geometric Mean N=40 runs; All functions page aligned
length, align1, align2, max_char, New Time / Old Time
     1,      1,      1,      127,               0.914
     2,      2,      2,      127,               0.952
     3,      3,      3,      127,               0.924
     4,      4,      4,      127,               0.995
     5,      5,      5,      127,               0.985
     6,      6,      6,      127,               1.017
     7,      7,      7,      127,               1.031
     8,      0,      0,      127,               0.967
     9,      1,      1,      127,               0.969
    10,      2,      2,      127,               0.951
    11,      3,      3,      127,               0.938
    12,      4,      4,      127,               0.937
    13,      5,      5,      127,               0.967
    14,      6,      6,      127,               0.941
    15,      7,      7,      127,               0.951
     4,      0,      0,      127,               0.959
     4,      0,      0,      254,                0.98
     8,      0,      0,      254,               0.959
    16,      0,      0,      127,               0.895
    16,      0,      0,      254,               0.901
    32,      0,      0,      127,                0.85
    32,      0,      0,      254,               0.851
    64,      0,      0,      127,               0.897
    64,      0,      0,      254,               0.895
   128,      0,      0,      127,               0.944
   128,      0,      0,      254,               0.935
   256,      0,      0,      127,               0.922
   256,      0,      0,      254,               0.913
   512,      0,      0,      127,               0.921
   512,      0,      0,      254,               0.914
  1024,      0,      0,      127,               0.845
  1024,      0,      0,      254,                0.84
    16,      1,      2,      127,               0.923
    16,      2,      1,      254,               0.955
    32,      2,      4,      127,               0.979
    32,      4,      2,      254,               0.957
    64,      3,      6,      127,               0.866
    64,      6,      3,      254,               0.849
   128,      4,      0,      127,               0.882
   128,      0,      4,      254,               0.876
   256,      5,      2,      127,               0.877
   256,      2,      5,      254,               0.882
   512,      6,      4,      127,               0.822
   512,      4,      6,      254,               0.862
  1024,      7,      6,      127,               0.903
  1024,      6,      7,      254,               0.908

 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++--------------
 1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 580feb90e9..7805ae9d41 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -88,9 +88,8 @@ ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -99,9 +98,8 @@ ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (depending on whether CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
@@ -169,27 +167,22 @@ STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -200,32 +193,26 @@ LABEL(touppermask):
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 # ifdef USE_AVX
 #  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
+	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
+	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
+	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
+	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
+	vpaddb	%xmm7, reg1, reg1;					\
+	vpaddb	%xmm8, reg2, reg2
 # else
 #  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
 # endif
 	TOLOWER (%xmm1, %xmm2)
 #else
-- 
2.25.1


  parent reply	other threads:[~2022-03-23 22:02 UTC|newest]

Thread overview: 76+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-23 21:57 [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c Noah Goldstein
2022-03-23 21:57 ` [PATCH v1 02/23] benchtests: Add random benchmark " Noah Goldstein
2022-03-24 18:44   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 03/23] x86: Code cleanup in strchr-avx2 and comment justifying branch Noah Goldstein
2022-03-24 18:53   ` H.J. Lu
2022-03-24 19:20     ` Noah Goldstein
2022-03-24 19:36       ` H.J. Lu
2022-05-12 19:31         ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 04/23] x86: Code cleanup in strchr-evex " Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-05-12 19:32     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 05/23] benchtests: Use json-lib in bench-strpbrk.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 06/23] benchtests: Use json-lib in bench-strspn.c Noah Goldstein
2022-03-24 18:54   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 07/23] x86: Optimize strcspn and strpbrk in strcspn-c.c Noah Goldstein
2022-03-24 18:55   ` H.J. Lu
2022-05-12 19:34     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 08/23] x86: Optimize strspn in strspn-c.c Noah Goldstein
2022-03-24 18:56   ` H.J. Lu
2022-05-12 19:39     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 09/23] x86: Remove strcspn-sse2.S and use the generic implementation Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:40     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 10/23] x86: Remove strpbrk-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:41     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 11/23] x86: Remove strspn-sse2.S " Noah Goldstein
2022-03-24 18:57   ` H.J. Lu
2022-05-12 19:42     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 12/23] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] Noah Goldstein
2022-03-24 18:59   ` H.J. Lu
2022-03-24 19:18     ` Noah Goldstein
2022-03-24 19:34       ` H.J. Lu
2022-03-24 19:39         ` Noah Goldstein
2022-03-24 20:50   ` [PATCH v2 12/31] " Noah Goldstein
2022-03-24 21:26     ` H.J. Lu
2022-03-24 21:43       ` Noah Goldstein
2022-03-24 21:58         ` H.J. Lu
2022-05-04  6:05           ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 13/23] benchtests: Use json-lib in bench-strcasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 14/23] benchtests: Use json-lib in bench-strncasecmp.c Noah Goldstein
2022-03-24 19:00   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 15/23] string: Expand page cross tests in test-strcasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 16/23] string: Expand page cross tests in test-strncasecmp.c Noah Goldstein
2022-03-24 19:01   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-05-12 19:44     ` Sunil Pandey
2022-03-23 21:57 ` Noah Goldstein [this message]
2022-03-24 19:02   ` [PATCH v1 18/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S H.J. Lu
2022-05-12 19:45     ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 19/23] string: Expand page cross test cases in test-strcmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 20/23] string: Expand page cross test cases in test-strncmp.c Noah Goldstein
2022-03-24 19:02   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 21/23] x86: Add AVX2 optimized str{n}casecmp Noah Goldstein
2022-03-24 19:03   ` H.J. Lu
2022-03-24 22:41   ` [PATCH v3 " Noah Goldstein
2022-03-24 22:41   ` [PATCH v3 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 23:56   ` [PATCH v4 21/23] x86: Add AVX2 " Noah Goldstein
2022-03-24 23:56     ` [PATCH v4 22/23] x86: Add EVEX " Noah Goldstein
2022-03-25 18:15       ` H.J. Lu
2022-03-25 18:18         ` Noah Goldstein
2022-05-12 19:47           ` Sunil Pandey
2022-05-12 19:52             ` Sunil Pandey
2022-03-25 18:14     ` [PATCH v4 21/23] x86: Add AVX2 " H.J. Lu
2022-05-12 19:52       ` Sunil Pandey
2022-03-23 21:57 ` [PATCH v1 22/23] x86: Add EVEX " Noah Goldstein
2022-03-24 19:04   ` H.J. Lu
2022-03-23 21:57 ` [PATCH v1 23/23] x86: Remove AVX str{n}casecmp Noah Goldstein
2022-03-24 19:04   ` H.J. Lu
2022-05-12 19:54     ` Sunil Pandey
2022-03-24 18:43 ` [PATCH v1 01/23] benchtests: Use json-lib in bench-strchr.c H.J. Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220323215734.3927131-18-goldstein.w.n@gmail.com \
    --to=goldstein.w.n@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).