From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-vk1-xa36.google.com (mail-vk1-xa36.google.com [IPv6:2607:f8b0:4864:20::a36]) by sourceware.org (Postfix) with ESMTPS id C2662388457E; Thu, 12 May 2022 19:44:49 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org C2662388457E Received: by mail-vk1-xa36.google.com with SMTP id m203so3179670vke.13; Thu, 12 May 2022 12:44:49 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=vWZn+JQwPkeRIR3GDiM/lp/UXnXqtDKRdVAAvIR2Imc=; b=NI6s11NviYSkmt6QU+1fVGlxcjLsCvu6rDyabW+bQnxLWbzmO/LUqDv+YCREy8rCDP oiystqlugoZA9w9+SACLcNbmsmXgynLwoEAySn+BO0rDAssCeehfikbHufEBO0mMkzzV yb0OznUwMMzLxpiJS0ozgWADH9GXunXVgy4uc80zE10WgMGhlHJukxXYu2Ht73quTfHM 4xOJ/1BtlAFkvS9JSWsCVVNdl1LfmQBClJaQpXnZ/RrDGd/cT5jLe4yXlRgj4E0NtbvG B+2vKXDVAeJ6Ds5V53LzbVFENNV6BbDLfuVkn9amCF5VgwJT0TufXmVpFJvVZxqCLN1F 58Hw== X-Gm-Message-State: AOAM531Zgeeujj3ll5B1otr1OxqkGX6h2mahegq3KR7IzCaEIeER8VJG Ba45sqRBeemWZO63X5Oexvru4uxMTH+3mvs1PPPI2l+dJfs= X-Google-Smtp-Source: ABdhPJxFGLaFL7OQgAgIe8vltrQNWdONbz1yxV26bCScZ0UN/RTH1ERcNm7L9/CyHjxgQxNxwrY8vB0xNS3MY0aO1JI= X-Received: by 2002:a1f:2c0b:0:b0:345:e29e:cb24 with SMTP id s11-20020a1f2c0b000000b00345e29ecb24mr1059444vks.7.1652384689130; Thu, 12 May 2022 12:44:49 -0700 (PDT) MIME-Version: 1.0 References: <20220323215734.3927131-1-goldstein.w.n@gmail.com> <20220323215734.3927131-17-goldstein.w.n@gmail.com> In-Reply-To: From: Sunil Pandey Date: Thu, 12 May 2022 12:44:13 -0700 Message-ID: Subject: Re: [PATCH v1 17/23] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S To: "H.J. Lu" , Libc-stable Mailing List Cc: Noah Goldstein , GNU C Library Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-7.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, HK_RANDOM_ENVFROM, HK_RANDOM_FROM, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-stable@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-stable mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 12 May 2022 19:44:51 -0000 On Thu, Mar 24, 2022 at 12:05 PM H.J. Lu via Libc-alpha wrote: > > On Wed, Mar 23, 2022 at 3:01 PM Noah Goldstein wrote: > > > > Slightly faster method of doing TOLOWER that saves an > > instruction. > > > > Also replace the hard coded 5-byte no with .p2align 4. On builds with > > CET enabled this misaligned entry to strcasecmp. > > > > geometric_mean(N=40) of all benchmarks New / Original: .894 > > > > All string/memory tests pass. > > --- > > Geomtric Mean N=40 runs; All functions page aligned > > length, align1, align2, max_char, New Time / Old Time > > 1, 1, 1, 127, 0.903 > > 2, 2, 2, 127, 0.905 > > 3, 3, 3, 127, 0.877 > > 4, 4, 4, 127, 0.888 > > 5, 5, 5, 127, 0.901 > > 6, 6, 6, 127, 0.954 > > 7, 7, 7, 127, 0.932 > > 8, 0, 0, 127, 0.918 > > 9, 1, 1, 127, 0.914 > > 10, 2, 2, 127, 0.877 > > 11, 3, 3, 127, 0.909 > > 12, 4, 4, 127, 0.876 > > 13, 5, 5, 127, 0.886 > > 14, 6, 6, 127, 0.914 > > 15, 7, 7, 127, 0.939 > > 4, 0, 0, 127, 0.963 > > 4, 0, 0, 254, 0.943 > > 8, 0, 0, 254, 0.927 > > 16, 0, 0, 127, 0.876 > > 16, 0, 0, 254, 0.865 > > 32, 0, 0, 127, 0.865 > > 32, 0, 0, 254, 0.862 > > 64, 0, 0, 127, 0.863 > > 64, 0, 0, 254, 0.896 > > 128, 0, 0, 127, 0.885 > > 128, 0, 0, 254, 0.882 > > 256, 0, 0, 127, 0.87 > > 256, 0, 0, 254, 0.869 > > 512, 0, 0, 127, 0.832 > > 512, 0, 0, 254, 0.848 > > 1024, 0, 0, 127, 0.835 > > 1024, 0, 0, 254, 0.843 > > 16, 1, 2, 127, 0.914 > > 16, 2, 1, 254, 0.949 > > 32, 2, 4, 127, 0.955 > > 32, 4, 2, 254, 1.004 > > 64, 3, 6, 127, 0.844 > > 64, 6, 3, 254, 0.905 > > 128, 4, 0, 127, 0.889 > > 128, 0, 4, 254, 0.845 > > 256, 5, 2, 127, 0.929 > > 256, 2, 5, 254, 0.907 > > 512, 6, 4, 127, 0.837 > > 512, 4, 6, 254, 0.862 > > 1024, 7, 6, 127, 0.895 > > 1024, 6, 7, 254, 0.89 > > > > sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- > > 1 file changed, 29 insertions(+), 35 deletions(-) > > > > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S > > index e2ab59c555..99d8b36f1d 100644 > > --- a/sysdeps/x86_64/strcmp.S > > +++ b/sysdeps/x86_64/strcmp.S > > @@ -75,9 +75,8 @@ ENTRY2 (__strcasecmp) > > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > > mov %fs:(%rax),%RDX_LP > > > > - // XXX 5 byte should be before the function > > - /* 5-byte NOP. */ > > - .byte 0x0f,0x1f,0x44,0x00,0x00 > > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > > + .p2align 4 > > END2 (__strcasecmp) > > # ifndef NO_NOLOCALE_ALIAS > > weak_alias (__strcasecmp, strcasecmp) > > @@ -94,9 +93,8 @@ ENTRY2 (__strncasecmp) > > movq __libc_tsd_LOCALE@gottpoff(%rip),%rax > > mov %fs:(%rax),%RCX_LP > > > > - // XXX 5 byte should be before the function > > - /* 5-byte NOP. */ > > - .byte 0x0f,0x1f,0x44,0x00,0x00 > > + /* Either 1 or 5 bytes (dependeing if CET is enabled). */ > > + .p2align 4 > > END2 (__strncasecmp) > > # ifndef NO_NOLOCALE_ALIAS > > weak_alias (__strncasecmp, strncasecmp) > > @@ -146,22 +144,22 @@ ENTRY (STRCMP) > > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > > .section .rodata.cst16,"aM",@progbits,16 > > .align 16 > > -.Lbelowupper: > > - .quad 0x4040404040404040 > > - .quad 0x4040404040404040 > > -.Ltopupper: > > - .quad 0x5b5b5b5b5b5b5b5b > > - .quad 0x5b5b5b5b5b5b5b5b > > -.Ltouppermask: > > +.Llcase_min: > > + .quad 0x3f3f3f3f3f3f3f3f > > + .quad 0x3f3f3f3f3f3f3f3f > > +.Llcase_max: > > + .quad 0x9999999999999999 > > + .quad 0x9999999999999999 > > +.Lcase_add: > > .quad 0x2020202020202020 > > .quad 0x2020202020202020 > > .previous > > - movdqa .Lbelowupper(%rip), %xmm5 > > -# define UCLOW_reg %xmm5 > > - movdqa .Ltopupper(%rip), %xmm6 > > -# define UCHIGH_reg %xmm6 > > - movdqa .Ltouppermask(%rip), %xmm7 > > -# define LCQWORD_reg %xmm7 > > + movdqa .Llcase_min(%rip), %xmm5 > > +# define LCASE_MIN_reg %xmm5 > > + movdqa .Llcase_max(%rip), %xmm6 > > +# define LCASE_MAX_reg %xmm6 > > + movdqa .Lcase_add(%rip), %xmm7 > > +# define CASE_ADD_reg %xmm7 > > #endif > > cmp $0x30, %ecx > > ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ > > @@ -172,22 +170,18 @@ ENTRY (STRCMP) > > movhpd 8(%rdi), %xmm1 > > movhpd 8(%rsi), %xmm2 > > #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L > > -# define TOLOWER(reg1, reg2) \ > > - movdqa reg1, %xmm8; \ > > - movdqa UCHIGH_reg, %xmm9; \ > > - movdqa reg2, %xmm10; \ > > - movdqa UCHIGH_reg, %xmm11; \ > > - pcmpgtb UCLOW_reg, %xmm8; \ > > - pcmpgtb reg1, %xmm9; \ > > - pcmpgtb UCLOW_reg, %xmm10; \ > > - pcmpgtb reg2, %xmm11; \ > > - pand %xmm9, %xmm8; \ > > - pand %xmm11, %xmm10; \ > > - pand LCQWORD_reg, %xmm8; \ > > - pand LCQWORD_reg, %xmm10; \ > > - por %xmm8, reg1; \ > > - por %xmm10, reg2 > > - TOLOWER (%xmm1, %xmm2) > > +# define TOLOWER(reg1, reg2) \ > > + movdqa LCASE_MIN_reg, %xmm8; \ > > + movdqa LCASE_MIN_reg, %xmm9; \ > > + paddb reg1, %xmm8; \ > > + paddb reg2, %xmm9; \ > > + pcmpgtb LCASE_MAX_reg, %xmm8; \ > > + pcmpgtb LCASE_MAX_reg, %xmm9; \ > > + pandn CASE_ADD_reg, %xmm8; \ > > + pandn CASE_ADD_reg, %xmm9; \ > > + paddb %xmm8, reg1; \ > > + paddb %xmm9, reg2 > > + TOLOWER (%xmm1, %xmm2) > > #else > > # define TOLOWER(reg1, reg2) > > #endif > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil