From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pg1-x530.google.com (mail-pg1-x530.google.com [IPv6:2607:f8b0:4864:20::530]) by sourceware.org (Postfix) with ESMTPS id 7A13D3857810 for ; Wed, 23 Jun 2021 05:16:04 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 7A13D3857810 Received: by mail-pg1-x530.google.com with SMTP id e20so809074pgg.0 for ; Tue, 22 Jun 2021 22:16:04 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=Y1Z96DG0pXkeQLLN/F9ASEkEwMYXT4lJIwxnrXC6B7s=; b=KQBm2kjqxdQ3h9cWI3sWEBGkIqnTkCdypL905XiLxlrpVv3T2wERZ91PpnuZLcxXay AbRMyaU4U8jJ7i+Y40h4ZzxEZ1BK00AE4pl5qrlpCLhep4PpjnQOqIcpeUjmVeVbRZT8 +h+KZlv2O+TngGbXqYXkEpEU8CeLIbnK0LD+2kgqylG2QzmZ/csCFYI6oGUJ1plLN/rq 8wQd1e0cSUV7L5dDuSfARz9LwvWs6wJUKKIaqnO/WOmIY4hrd8rnKXoRZ+d3TJy20uSd zoq+ZJ5IjoSYDq833vZpUNJcAsrA6Y+Vto0UgyUxfywnsSMEueuMEkRdRzNtzLTtqBtq vhPA== X-Gm-Message-State: AOAM53073j8CW8UmWyFcXxVhHOQjudOSyJWvlSvEa/4Q/XUCS/0gijVg gFgvgLPHxABoVc3xQZFfSr16yw8to0sblmPsSdpDJuvs X-Google-Smtp-Source: ABdhPJxyLWNVSJnDWPaNQLn4gGgf3rCP5hZFEMQ+eNq2ThO5LhG5EkTnLEb5aPEjIhQ3n729/WMR5Xg8/W8KNcCv9Ck= X-Received: by 2002:a65:4985:: with SMTP id r5mr2157507pgs.122.1624425363572; Tue, 22 Jun 2021 22:16:03 -0700 (PDT) MIME-Version: 1.0 References: <20210623034740.3006880-1-hjl.tools@gmail.com> In-Reply-To: <20210623034740.3006880-1-hjl.tools@gmail.com> From: Noah Goldstein Date: Wed, 23 Jun 2021 01:15:52 -0400 Message-ID: Subject: Re: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S To: "H.J. 
Lu" Cc: GNU C Library X-Spam-Status: No, score=-8.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, HTML_MESSAGE, KAM_SHORT, RCVD_IN_DNSWL_NONE, SCC_5_SHORT_WORD_LINES, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org Content-Type: text/plain; charset="UTF-8" X-Content-Filtered-By: Mailman/MimeDel 2.1.29 X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 23 Jun 2021 05:16:08 -0000 On Tue, Jun 22, 2021 at 11:47 PM H.J. Lu wrote: > Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1 > version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S > and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants. > This also removes the unused symbols, __GI___strlen_sse2 and > __GI___wcsnlen_sse4_1. 
> --- > sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +- > sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++ > sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +- > sysdeps/x86_64/strlen.S | 243 +------------------- > 4 files changed, 262 insertions(+), 242 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S > > diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S > b/sysdeps/x86_64/multiarch/strlen-sse2.S > index 65769f3c2a..f10741c079 100644 > --- a/sysdeps/x86_64/multiarch/strlen-sse2.S > +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S > @@ -20,4 +20,4 @@ > # define strlen __strlen_sse2 > #endif > > -#include "../strlen.S" > +#include "strlen-vec.S" > diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S > b/sysdeps/x86_64/multiarch/strlen-vec.S > new file mode 100644 > index 0000000000..8f660bb9c7 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlen-vec.S > @@ -0,0 +1,257 @@ > +/* SSE2 version of strlen and SSE4.1 version of wcslen. > + Copyright (C) 2012-2021 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . 
*/ > + > +#include <sysdep.h> > + > +#ifdef AS_WCSLEN > +# define PMINU pminud > +# define PCMPEQ pcmpeqd > +# define SHIFT_RETURN shrq $2, %rax > +#else > +# define PMINU pminub > +# define PCMPEQ pcmpeqb > +# define SHIFT_RETURN > +#endif > + > +/* Long lived register in strlen(s), strnlen(s, n) are: > + > + %xmm3 - zero > + %rdi - s > + %r10 (s+n) & (~(64-1)) > + %r11 s+n > +*/ > + > + > +.text > +ENTRY(strlen) > + > +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ > +#define FIND_ZERO \ > + PCMPEQ (%rax), %xmm0; \ > + PCMPEQ 16(%rax), %xmm1; \ > + PCMPEQ 32(%rax), %xmm2; \ > + PCMPEQ 48(%rax), %xmm3; \ > + pmovmskb %xmm0, %esi; \ > + pmovmskb %xmm1, %edx; \ > + pmovmskb %xmm2, %r8d; \ > + pmovmskb %xmm3, %ecx; \ > + salq $16, %rdx; \ > + salq $16, %rcx; \ > + orq %rsi, %rdx; \ > + orq %r8, %rcx; \ > + salq $32, %rcx; \ > + orq %rcx, %rdx; > + > +#ifdef AS_STRNLEN > +/* Do not read anything when n==0. */ > + test %RSI_LP, %RSI_LP > + jne L(n_nonzero) > + xor %rax, %rax > + ret > +L(n_nonzero): > +# ifdef AS_WCSLEN > + shl $2, %RSI_LP > +# endif > + > +/* Initialize long lived registers. */ > + > + add %RDI_LP, %RSI_LP > + mov %RSI_LP, %R10_LP > + and $-64, %R10_LP > + mov %RSI_LP, %R11_LP > +#endif > + > + pxor %xmm0, %xmm0 > + pxor %xmm1, %xmm1 > + pxor %xmm2, %xmm2 > + pxor %xmm3, %xmm3 > + movq %rdi, %rax > + movq %rdi, %rcx > + andq $4095, %rcx > +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ > + cmpq $4047, %rcx > +/* We cannot unify this branching as it would be ~6 cycles slower. */ > + ja L(cross_page) > + > +#ifdef AS_STRNLEN > +/* Test if end is among first 64 bytes. */ > +# define STRNLEN_PROLOG \ > + mov %r11, %rsi; \ > + subq %rax, %rsi; \ > + andq $-64, %rax; \ > + testq $-64, %rsi; \ > + je L(strnlen_ret) > +#else > +# define STRNLEN_PROLOG andq $-64, %rax; > +#endif > + > +/* Ignore bits in mask that come before start of string. 
*/ > +#define PROLOG(lab) \ > + movq %rdi, %rcx; \ > + xorq %rax, %rcx; \ > + STRNLEN_PROLOG; \ > + sarq %cl, %rdx; \ > + test %rdx, %rdx; \ > + je L(lab); \ > + bsfq %rdx, %rax; \ > + SHIFT_RETURN; \ > + ret > + > +#ifdef AS_STRNLEN > + andq $-16, %rax > + FIND_ZERO > +#else > + /* Test first 16 bytes unaligned. */ > + movdqu (%rax), %xmm4 > + PCMPEQ %xmm0, %xmm4 > + pmovmskb %xmm4, %edx > + test %edx, %edx > + je L(next48_bytes) > + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ > + SHIFT_RETURN > + ret > + > +L(next48_bytes): > +/* Same as FIND_ZERO except we do not check first 16 bytes. */ > + andq $-16, %rax > + PCMPEQ 16(%rax), %xmm1 > + PCMPEQ 32(%rax), %xmm2 > + PCMPEQ 48(%rax), %xmm3 > + pmovmskb %xmm1, %edx > + pmovmskb %xmm2, %r8d > + pmovmskb %xmm3, %ecx > + salq $16, %rdx > + salq $16, %rcx > + orq %r8, %rcx > + salq $32, %rcx > + orq %rcx, %rdx > +#endif > + > + /* When no zero byte is found xmm1-3 are zero so we do not have to > + zero them. */ > + PROLOG(loop) > + > + .p2align 4 > +L(cross_page): > + andq $-64, %rax > + FIND_ZERO > + PROLOG(loop_init) > + > +#ifdef AS_STRNLEN > +/* We must do this check to correctly handle strnlen (s, -1). */ > +L(strnlen_ret): > + bts %rsi, %rdx > + sarq %cl, %rdx > + test %rdx, %rdx > + je L(loop_init) > + bsfq %rdx, %rax > + SHIFT_RETURN > + ret > +#endif > + .p2align 4 > +L(loop_init): > + pxor %xmm1, %xmm1 > + pxor %xmm2, %xmm2 > + pxor %xmm3, %xmm3 > +#ifdef AS_STRNLEN > + .p2align 4 > +L(loop): > + > + addq $64, %rax > + cmpq %rax, %r10 > + je L(exit_end) > + > + movdqa (%rax), %xmm0 > + PMINU 16(%rax), %xmm0 > + PMINU 32(%rax), %xmm0 > + PMINU 48(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit) > + jmp L(loop) > + > + .p2align 4 > +L(exit_end): > + cmp %rax, %r11 > + je L(first) /* Do not read when end is at page boundary. 
*/ > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > +L(first): > + bts %r11, %rdx > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > + .p2align 4 > +L(exit): > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > +#else > + > + /* Main loop. Unrolled twice to improve L2 cache performance on > core2. */ > + .p2align 4 > +L(loop): > + > + movdqa 64(%rax), %xmm0 > + PMINU 80(%rax), %xmm0 > + PMINU 96(%rax), %xmm0 > + PMINU 112(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit64) > + > + subq $-128, %rax > + > + movdqa (%rax), %xmm0 > + PMINU 16(%rax), %xmm0 > + PMINU 32(%rax), %xmm0 > + PMINU 48(%rax), %xmm0 > + PCMPEQ %xmm3, %xmm0 > + pmovmskb %xmm0, %edx > + testl %edx, %edx > + jne L(exit0) > + jmp L(loop) > + > + .p2align 4 > +L(exit64): > + addq $64, %rax > +L(exit0): > + pxor %xmm0, %xmm0 > + FIND_ZERO > + > + bsfq %rdx, %rdx > + addq %rdx, %rax > + subq %rdi, %rax > + SHIFT_RETURN > + ret > + > +#endif > + > +END(strlen) > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > index a8cab0cb00..5fa51fe07c 100644 > --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S > @@ -2,4 +2,4 @@ > #define AS_STRNLEN > #define strlen __wcsnlen_sse4_1 > > -#include "../strlen.S" > +#include "strlen-vec.S" > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S > index d223ea1700..8422c15cc8 100644 > --- a/sysdeps/x86_64/strlen.S > +++ b/sysdeps/x86_64/strlen.S > @@ -1,5 +1,5 @@ > -/* SSE2 version of strlen/wcslen. > - Copyright (C) 2012-2021 Free Software Foundation, Inc. > +/* SSE2 version of strlen. > + Copyright (C) 2021 Free Software Foundation, Inc. > This file is part of the GNU C Library. 
> > The GNU C Library is free software; you can redistribute it and/or > @@ -16,243 +16,6 @@ > License along with the GNU C Library; if not, see > . */ > > -#include > +#include "multiarch/strlen-vec.S" > > -#ifdef AS_WCSLEN > -# define PMINU pminud > -# define PCMPEQ pcmpeqd > -# define SHIFT_RETURN shrq $2, %rax > -#else > -# define PMINU pminub > -# define PCMPEQ pcmpeqb > -# define SHIFT_RETURN > -#endif > - > -/* Long lived register in strlen(s), strnlen(s, n) are: > - > - %xmm3 - zero > - %rdi - s > - %r10 (s+n) & (~(64-1)) > - %r11 s+n > -*/ > - > - > -.text > -ENTRY(strlen) > - > -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ > -#define FIND_ZERO \ > - PCMPEQ (%rax), %xmm0; \ > - PCMPEQ 16(%rax), %xmm1; \ > - PCMPEQ 32(%rax), %xmm2; \ > - PCMPEQ 48(%rax), %xmm3; \ > - pmovmskb %xmm0, %esi; \ > - pmovmskb %xmm1, %edx; \ > - pmovmskb %xmm2, %r8d; \ > - pmovmskb %xmm3, %ecx; \ > - salq $16, %rdx; \ > - salq $16, %rcx; \ > - orq %rsi, %rdx; \ > - orq %r8, %rcx; \ > - salq $32, %rcx; \ > - orq %rcx, %rdx; > - > -#ifdef AS_STRNLEN > -/* Do not read anything when n==0. */ > - test %RSI_LP, %RSI_LP > - jne L(n_nonzero) > - xor %rax, %rax > - ret > -L(n_nonzero): > -# ifdef AS_WCSLEN > - shl $2, %RSI_LP > -# endif > - > -/* Initialize long lived registers. */ > - > - add %RDI_LP, %RSI_LP > - mov %RSI_LP, %R10_LP > - and $-64, %R10_LP > - mov %RSI_LP, %R11_LP > -#endif > - > - pxor %xmm0, %xmm0 > - pxor %xmm1, %xmm1 > - pxor %xmm2, %xmm2 > - pxor %xmm3, %xmm3 > - movq %rdi, %rax > - movq %rdi, %rcx > - andq $4095, %rcx > -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ > - cmpq $4047, %rcx > -/* We cannot unify this branching as it would be ~6 cycles slower. */ > - ja L(cross_page) > - > -#ifdef AS_STRNLEN > -/* Test if end is among first 64 bytes. 
*/ > -# define STRNLEN_PROLOG \ > - mov %r11, %rsi; \ > - subq %rax, %rsi; \ > - andq $-64, %rax; \ > - testq $-64, %rsi; \ > - je L(strnlen_ret) > -#else > -# define STRNLEN_PROLOG andq $-64, %rax; > -#endif > - > -/* Ignore bits in mask that come before start of string. */ > -#define PROLOG(lab) \ > - movq %rdi, %rcx; \ > - xorq %rax, %rcx; \ > - STRNLEN_PROLOG; \ > - sarq %cl, %rdx; \ > - test %rdx, %rdx; \ > - je L(lab); \ > - bsfq %rdx, %rax; \ > - SHIFT_RETURN; \ > - ret > - > -#ifdef AS_STRNLEN > - andq $-16, %rax > - FIND_ZERO > -#else > - /* Test first 16 bytes unaligned. */ > - movdqu (%rax), %xmm4 > - PCMPEQ %xmm0, %xmm4 > - pmovmskb %xmm4, %edx > - test %edx, %edx > - je L(next48_bytes) > - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ > - SHIFT_RETURN > - ret > - > -L(next48_bytes): > -/* Same as FIND_ZERO except we do not check first 16 bytes. */ > - andq $-16, %rax > - PCMPEQ 16(%rax), %xmm1 > - PCMPEQ 32(%rax), %xmm2 > - PCMPEQ 48(%rax), %xmm3 > - pmovmskb %xmm1, %edx > - pmovmskb %xmm2, %r8d > - pmovmskb %xmm3, %ecx > - salq $16, %rdx > - salq $16, %rcx > - orq %r8, %rcx > - salq $32, %rcx > - orq %rcx, %rdx > -#endif > - > - /* When no zero byte is found xmm1-3 are zero so we do not have to > - zero them. */ > - PROLOG(loop) > - > - .p2align 4 > -L(cross_page): > - andq $-64, %rax > - FIND_ZERO > - PROLOG(loop_init) > - > -#ifdef AS_STRNLEN > -/* We must do this check to correctly handle strnlen (s, -1). 
*/ > -L(strnlen_ret): > - bts %rsi, %rdx > - sarq %cl, %rdx > - test %rdx, %rdx > - je L(loop_init) > - bsfq %rdx, %rax > - SHIFT_RETURN > - ret > -#endif > - .p2align 4 > -L(loop_init): > - pxor %xmm1, %xmm1 > - pxor %xmm2, %xmm2 > - pxor %xmm3, %xmm3 > -#ifdef AS_STRNLEN > - .p2align 4 > -L(loop): > - > - addq $64, %rax > - cmpq %rax, %r10 > - je L(exit_end) > - > - movdqa (%rax), %xmm0 > - PMINU 16(%rax), %xmm0 > - PMINU 32(%rax), %xmm0 > - PMINU 48(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit) > - jmp L(loop) > - > - .p2align 4 > -L(exit_end): > - cmp %rax, %r11 > - je L(first) /* Do not read when end is at page boundary. */ > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > -L(first): > - bts %r11, %rdx > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > - .p2align 4 > -L(exit): > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > -#else > - > - /* Main loop. Unrolled twice to improve L2 cache performance on > core2. */ > - .p2align 4 > -L(loop): > - > - movdqa 64(%rax), %xmm0 > - PMINU 80(%rax), %xmm0 > - PMINU 96(%rax), %xmm0 > - PMINU 112(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit64) > - > - subq $-128, %rax > - > - movdqa (%rax), %xmm0 > - PMINU 16(%rax), %xmm0 > - PMINU 32(%rax), %xmm0 > - PMINU 48(%rax), %xmm0 > - PCMPEQ %xmm3, %xmm0 > - pmovmskb %xmm0, %edx > - testl %edx, %edx > - jne L(exit0) > - jmp L(loop) > - > - .p2align 4 > -L(exit64): > - addq $64, %rax > -L(exit0): > - pxor %xmm0, %xmm0 > - FIND_ZERO > - > - bsfq %rdx, %rdx > - addq %rdx, %rax > - subq %rdi, %rax > - SHIFT_RETURN > - ret > - > -#endif > - > -END(strlen) > libc_hidden_builtin_def (strlen) > -- > 2.31.1 > > LGTM.