From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id 8E8D43889806; Fri, 25 Mar 2022 18:19:02 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 8E8D43889806 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: 46d19d1b83c52cc92d8215c7a1a032bcab3d271c X-Git-Newrev: 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Message-Id: <20220325181902.8E8D43889806@sourceware.org> Date: Fri, 25 Mar 2022 18:19:02 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Fri, 25 Mar 2022 18:19:02 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=670b54bc585ea4a94f3b2e9272ba44aa6b730b73 commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Author: Noah Goldstein Date: Wed Mar 23 16:57:36 2022 -0500 x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S Slightly faster method of doing TOLOWER that saves an instruction. Also replace the hard-coded 5-byte NOP with .p2align 4. On builds with CET enabled this misaligned the entry to strcasecmp. geometric_mean(N=40) of all benchmarks New / Original: .894 All string/memory tests pass. Reviewed-by: H.J. Lu Diff: --- sysdeps/x86_64/strcmp.S | 64 ++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index e2ab59c555..99d8b36f1d 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RDX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. 
*/ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 END2 (__strcasecmp) # ifndef NO_NOLOCALE_ALIAS weak_alias (__strcasecmp, strcasecmp) @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp) movq __libc_tsd_LOCALE@gottpoff(%rip),%rax mov %fs:(%rax),%RCX_LP - // XXX 5 byte should be before the function - /* 5-byte NOP. */ - .byte 0x0f,0x1f,0x44,0x00,0x00 + /* Either 1 or 5 bytes (depending on whether CET is enabled). */ + .p2align 4 END2 (__strncasecmp) # ifndef NO_NOLOCALE_ALIAS weak_alias (__strncasecmp, strncasecmp) @@ -146,22 +144,22 @@ ENTRY (STRCMP) #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L .section .rodata.cst16,"aM",@progbits,16 .align 16 -.Lbelowupper: - .quad 0x4040404040404040 - .quad 0x4040404040404040 -.Ltopupper: - .quad 0x5b5b5b5b5b5b5b5b - .quad 0x5b5b5b5b5b5b5b5b -.Ltouppermask: +.Llcase_min: + .quad 0x3f3f3f3f3f3f3f3f + .quad 0x3f3f3f3f3f3f3f3f +.Llcase_max: + .quad 0x9999999999999999 + .quad 0x9999999999999999 +.Lcase_add: .quad 0x2020202020202020 .quad 0x2020202020202020 .previous - movdqa .Lbelowupper(%rip), %xmm5 -# define UCLOW_reg %xmm5 - movdqa .Ltopupper(%rip), %xmm6 -# define UCHIGH_reg %xmm6 - movdqa .Ltouppermask(%rip), %xmm7 -# define LCQWORD_reg %xmm7 + movdqa .Llcase_min(%rip), %xmm5 +# define LCASE_MIN_reg %xmm5 + movdqa .Llcase_max(%rip), %xmm6 +# define LCASE_MAX_reg %xmm6 + movdqa .Lcase_add(%rip), %xmm7 +# define CASE_ADD_reg %xmm7 #endif cmp $0x30, %ecx ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ @@ -172,22 +170,18 @@ ENTRY (STRCMP) movhpd 8(%rdi), %xmm1 movhpd 8(%rsi), %xmm2 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L -# define TOLOWER(reg1, reg2) \ - movdqa reg1, %xmm8; \ - movdqa UCHIGH_reg, %xmm9; \ - movdqa reg2, %xmm10; \ - movdqa UCHIGH_reg, %xmm11; \ - pcmpgtb UCLOW_reg, %xmm8; \ - pcmpgtb reg1, %xmm9; \ - pcmpgtb UCLOW_reg, %xmm10; \ - pcmpgtb reg2, %xmm11; \ - pand %xmm9, %xmm8; \ - pand %xmm11, %xmm10; \ - pand 
LCQWORD_reg, %xmm8; \ - pand LCQWORD_reg, %xmm10; \ - por %xmm8, reg1; \ - por %xmm10, reg2 - TOLOWER (%xmm1, %xmm2) +# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm8; \ + movdqa LCASE_MIN_reg, %xmm9; \ + paddb reg1, %xmm8; \ + paddb reg2, %xmm9; \ + pcmpgtb LCASE_MAX_reg, %xmm8; \ + pcmpgtb LCASE_MAX_reg, %xmm9; \ + pandn CASE_ADD_reg, %xmm8; \ + pandn CASE_ADD_reg, %xmm9; \ + paddb %xmm8, reg1; \ + paddb %xmm9, reg2 + TOLOWER (%xmm1, %xmm2) #else # define TOLOWER(reg1, reg2) #endif