public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
From: Sunil Pandey <skpgkp2@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc/release/2.34/master] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
Date: Tue, 17 May 2022 03:20:41 +0000 (GMT)	[thread overview]
Message-ID: <20220517032041.59AC0385734B@sourceware.org> (raw)

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=3605c744078bb048d876298aaf12a2869e8071b8

commit 3605c744078bb048d876298aaf12a2869e8071b8
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Wed Mar 23 16:57:38 2022 -0500

    x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
    
    Slightly faster method of doing TOLOWER that saves an
    instruction.
    
    Also replace the hard coded 5-byte no with .p2align 4. On builds with
    CET enabled this misaligned entry to strcasecmp.
    
    geometric_mean(N=40) of all benchmarks New / Original: .920
    
    All string/memory tests pass.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226)

Diff:
---
 sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 ++++++++++++++-------------------
 1 file changed, 35 insertions(+), 48 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index 6197a723b9..a6825de819 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RDX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strcasecmp))
 	/* FALLTHROUGH to strcasecmp_l.  */
 #endif
@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp))
 	movq	__libc_tsd_LOCALE@gottpoff(%rip),%rax
 	mov	%fs:(%rax),%RCX_LP
 
-	// XXX 5 byte should be before the function
-	/* 5-byte NOP.  */
-	.byte	0x0f,0x1f,0x44,0x00,0x00
+	/* Either 1 or 5 bytes (dependeing if CET is enabled).  */
+	.p2align 4
 END (GLABEL(__strncasecmp))
 	/* FALLTHROUGH to strncasecmp_l.  */
 #endif
@@ -170,27 +168,22 @@ STRCMP_SSE42:
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	.section .rodata.cst16,"aM",@progbits,16
 	.align 16
-LABEL(belowupper):
-	.quad	0x4040404040404040
-	.quad	0x4040404040404040
-LABEL(topupper):
-# ifdef USE_AVX
-	.quad	0x5a5a5a5a5a5a5a5a
-	.quad	0x5a5a5a5a5a5a5a5a
-# else
-	.quad	0x5b5b5b5b5b5b5b5b
-	.quad	0x5b5b5b5b5b5b5b5b
-# endif
-LABEL(touppermask):
+LABEL(lcase_min):
+	.quad	0x3f3f3f3f3f3f3f3f
+	.quad	0x3f3f3f3f3f3f3f3f
+LABEL(lcase_max):
+	.quad	0x9999999999999999
+	.quad	0x9999999999999999
+LABEL(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.previous
-	movdqa	LABEL(belowupper)(%rip), %xmm4
-# define UCLOW_reg %xmm4
-	movdqa	LABEL(topupper)(%rip), %xmm5
-# define UCHIGH_reg %xmm5
-	movdqa	LABEL(touppermask)(%rip), %xmm6
-# define LCQWORD_reg %xmm6
+	movdqa	LABEL(lcase_min)(%rip), %xmm4
+# define LCASE_MIN_reg %xmm4
+	movdqa	LABEL(lcase_max)(%rip), %xmm5
+# define LCASE_MAX_reg %xmm5
+	movdqa	LABEL(case_add)(%rip), %xmm6
+# define CASE_ADD_reg %xmm6
 #endif
 	cmp	$0x30, %ecx
 	ja	LABEL(crosscache)/* rsi: 16-byte load will cross cache line */
@@ -201,32 +194,26 @@ LABEL(touppermask):
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 # ifdef USE_AVX
 #  define TOLOWER(reg1, reg2) \
-	vpcmpgtb UCLOW_reg, reg1, %xmm7;			\
-	vpcmpgtb UCHIGH_reg, reg1, %xmm8;			\
-	vpcmpgtb UCLOW_reg, reg2, %xmm9;			\
-	vpcmpgtb UCHIGH_reg, reg2, %xmm10;			\
-	vpandn	%xmm7, %xmm8, %xmm8;					\
-	vpandn	%xmm9, %xmm10, %xmm10;					\
-	vpand	LCQWORD_reg, %xmm8, %xmm8;				\
-	vpand	LCQWORD_reg, %xmm10, %xmm10;				\
-	vpor	reg1, %xmm8, reg1;					\
-	vpor	reg2, %xmm10, reg2
+	vpaddb	LCASE_MIN_reg, reg1, %xmm7;					\
+	vpaddb	LCASE_MIN_reg, reg2, %xmm8;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7;					\
+	vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8;					\
+	vpandn	CASE_ADD_reg, %xmm7, %xmm7;					\
+	vpandn	CASE_ADD_reg, %xmm8, %xmm8;					\
+	vpaddb	%xmm7, reg1, reg1;					\
+	vpaddb	%xmm8, reg2, reg2
 # else
 #  define TOLOWER(reg1, reg2) \
-	movdqa	reg1, %xmm7;					\
-	movdqa	UCHIGH_reg, %xmm8;				\
-	movdqa	reg2, %xmm9;					\
-	movdqa	UCHIGH_reg, %xmm10;				\
-	pcmpgtb	UCLOW_reg, %xmm7;				\
-	pcmpgtb	reg1, %xmm8;					\
-	pcmpgtb	UCLOW_reg, %xmm9;				\
-	pcmpgtb	reg2, %xmm10;					\
-	pand	%xmm8, %xmm7;					\
-	pand	%xmm10, %xmm9;					\
-	pand	LCQWORD_reg, %xmm7;				\
-	pand	LCQWORD_reg, %xmm9;				\
-	por	%xmm7, reg1;					\
-	por	%xmm9, reg2
+	movdqa	LCASE_MIN_reg, %xmm7;					\
+	movdqa	LCASE_MIN_reg, %xmm8;					\
+	paddb	reg1, %xmm7;					\
+	paddb	reg2, %xmm8;					\
+	pcmpgtb	LCASE_MAX_reg, %xmm7;				\
+	pcmpgtb	LCASE_MAX_reg, %xmm8;				\
+	pandn	CASE_ADD_reg, %xmm7;					\
+	pandn	CASE_ADD_reg, %xmm8;					\
+	paddb	%xmm7, reg1;					\
+	paddb	%xmm8, reg2
 # endif
 	TOLOWER (%xmm1, %xmm2)
 #else


                 reply	other threads:[~2022-05-17  3:20 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220517032041.59AC0385734B@sourceware.org \
    --to=skpgkp2@sourceware.org \
    --cc=glibc-cvs@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).