From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pj1-x1030.google.com (mail-pj1-x1030.google.com [IPv6:2607:f8b0:4864:20::1030]) by sourceware.org (Postfix) with ESMTPS id 7C69C3858CDA for ; Tue, 12 Jul 2022 23:24:12 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 7C69C3858CDA Received: by mail-pj1-x1030.google.com with SMTP id o5-20020a17090a3d4500b001ef76490983so780254pjf.2 for ; Tue, 12 Jul 2022 16:24:12 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=BT0/MBB9+BXt+xRNQbSO0rwYeR0koTIsbgi5BVQ5A94=; b=CGC0s0uXpcLYO/1Ytlxul9OhAwtQc6ArXig5cnu6C2ids2jyqCw7dWbHhO332yuXRw yuu0Z6zj63DSgxuAGeFysGfZXFIJk9PrRE2OKapCRccndtwWFMavR8nlu9eTaz+jPTbG taboqEHg33RBpjWSiy2hUDgnmMK9GUSuONlMzAazgNVeFRanEKvFNy7jhjsCml+N9XW2 rMQIdNGxw9ZfxSWgYLgxmeNRkm3xGVwKzbhaLjg38gzGsWr4ksxr5rNvcllxe0rsCR8t halpm5dMicFeXxQyvpHfHwYlFRqAgADIPUqo0KjW8T8zM4EKxCc66QlXidmDK48SHWLW D3tA== X-Gm-Message-State: AJIora9nrWMqD94KZTPkNcqQnjLR8RmbWpX+GcNsLBCoEs8HVgcYJBgs f0Ac+cfU6J0Reg9pGaeu5bxRLcug1Xqtqc08MoOZ3AoX X-Google-Smtp-Source: AGRyM1sOzQQo1otwVdj0meS8DvgAlwEi8M6oT1ZZdvL/TB+GjQOLwaiiuKCkHM+5AvopTk1PAVA+9SX9/mUqZa1hWU0= X-Received: by 2002:a17:90a:eacd:b0:1ef:84c2:418d with SMTP id ev13-20020a17090aeacd00b001ef84c2418dmr6832508pjb.101.1657668251403; Tue, 12 Jul 2022 16:24:11 -0700 (PDT) MIME-Version: 1.0 References: <20220712192910.351121-1-goldstein.w.n@gmail.com> <20220712192910.351121-2-goldstein.w.n@gmail.com> In-Reply-To: <20220712192910.351121-2-goldstein.w.n@gmail.com> From: "H.J. 
Lu" Date: Tue, 12 Jul 2022 16:23:35 -0700 Message-ID: Subject: Re: [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3024.2 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 12 Jul 2022 23:24:15 -0000 On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/rtld-stpcpy.S | 18 ++++ > sysdeps/x86_64/multiarch/stpcpy-sse2.S | 15 +-- > sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++-- > sysdeps/x86_64/stpcpy.S | 3 +- > sysdeps/x86_64/strcpy.S | 138 +------------------------ > 5 files changed, 156 insertions(+), 155 deletions(-) > create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S > > diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S > new file mode 100644 > index 0000000000..914141f07f > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S > @@ -0,0 +1,18 @@ > +/* Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include "../stpcpy.S" > diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S > index 078504a44e..ea9f973af3 100644 > --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S > +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S > @@ -17,17 +17,10 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > - > -# include <sysdep.h> > -# define __stpcpy __stpcpy_sse2 > - > -# undef weak_alias > -# define weak_alias(ignored1, ignored2) > -# undef libc_hidden_def > -# define libc_hidden_def(__stpcpy) > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(stpcpy) > +# ifndef STRCPY > +# define STRCPY __stpcpy_sse2 > +# endif > #endif > > #define USE_AS_STPCPY > -#include <sysdeps/x86_64/stpcpy.S> > +#include "strcpy-sse2.S" > diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S > index f37967c441..8b5db8b13d 100644 > --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S > +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S > @@ -17,12 +17,137 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > +# ifndef STRCPY > +# define STRCPY __strcpy_sse2 > +# endif > +#endif > > -# include <sysdep.h> > -# define strcpy __strcpy_sse2 > +#include <sysdep.h> > > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(strcpy) > -#endif > + .text > +ENTRY (STRCPY) > + movq %rsi, %rcx /* Source register. 
*/ > + andl $7, %ecx /* mask alignment bits */ > + movq %rdi, %rdx /* Duplicate destination pointer. */ > + > + jz 5f /* aligned => start loop */ > + > + neg %ecx /* We need to align to 8 bytes. */ > + addl $8,%ecx > + /* Search the first bytes directly. */ > +0: > + movb (%rsi), %al /* Fetch a byte */ > + testb %al, %al /* Is it NUL? */ > + movb %al, (%rdx) /* Store it */ > + jz 4f /* If it was NUL, done! */ > + incq %rsi > + incq %rdx > + decl %ecx > + jnz 0b > + > +5: > + movq $0xfefefefefefefeff,%r8 > + > + /* Now the sources is aligned. Unfortunatly we cannot force > + to have both source and destination aligned, so ignore the > + alignment of the destination. */ > + .p2align 4 > +1: > + /* 1st unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 3f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 2nd unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. 
*/ > + > + jnz 3f /* found NUL => return pointer */ > > -#include > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 3rd unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 3f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + > + /* 4th unroll. */ > + movq (%rsi), %rax /* Read double word (8 bytes). */ > + addq $8, %rsi /* Adjust pointer for next word. */ > + movq %rax, %r9 /* Save a copy for NUL finding. */ > + addq %r8, %r9 /* add the magic value to the word. We get > + carry bits reported for each byte which > + is *not* 0 */ > + jnc 3f /* highest byte is NUL => return pointer */ > + xorq %rax, %r9 /* (word+magic)^word */ > + orq %r8, %r9 /* set all non-carry bits */ > + incq %r9 /* add 1: if one carry bit was *not* set > + the addition will not result in 0. */ > + > + jnz 3f /* found NUL => return pointer */ > + > + movq %rax, (%rdx) /* Write value to destination. */ > + addq $8, %rdx /* Adjust pointer. */ > + jmp 1b /* Next iteration. */ > + > + /* Do the last few bytes. %rax contains the value to write. > + The loop is unrolled twice. */ > + .p2align 4 > +3: > + /* Note that stpcpy needs to return with the value of the NUL > + byte. */ > + movb %al, (%rdx) /* 1st byte. */ > + testb %al, %al /* Is it NUL. */ > + jz 4f /* yes, finish. */ > + incq %rdx /* Increment destination. */ > + movb %ah, (%rdx) /* 2nd byte. 
*/ > + testb %ah, %ah /* Is it NUL?. */ > + jz 4f /* yes, finish. */ > + incq %rdx /* Increment destination. */ > + shrq $16, %rax /* Shift... */ > + jmp 3b /* and look at next two bytes in %rax. */ > + > +4: > +#ifdef USE_AS_STPCPY > + movq %rdx, %rax /* Destination is return value. */ > +#else > + movq %rdi, %rax /* Source is return value. */ > +#endif > + retq > +END (STRCPY) > diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S > index ec23de1416..b097c203dd 100644 > --- a/sysdeps/x86_64/stpcpy.S > +++ b/sysdeps/x86_64/stpcpy.S > @@ -1,7 +1,6 @@ > -#define USE_AS_STPCPY > #define STRCPY __stpcpy > > -#include <sysdeps/x86_64/strcpy.S> > +#include "multiarch/stpcpy-sse2.S" > > weak_alias (__stpcpy, stpcpy) > libc_hidden_def (__stpcpy) > diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S > index 17e8073550..05f19e6e94 100644 > --- a/sysdeps/x86_64/strcpy.S > +++ b/sysdeps/x86_64/strcpy.S > @@ -16,140 +16,6 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#include <sysdep.h> > -#include "asm-syntax.h" > - > -#ifndef USE_AS_STPCPY > -# define STRCPY strcpy > -#endif > - > - .text > -ENTRY (STRCPY) > - movq %rsi, %rcx /* Source register. */ > - andl $7, %ecx /* mask alignment bits */ > - movq %rdi, %rdx /* Duplicate destination pointer. */ > - > - jz 5f /* aligned => start loop */ > - > - neg %ecx /* We need to align to 8 bytes. */ > - addl $8,%ecx > - /* Search the first bytes directly. */ > -0: > - movb (%rsi), %al /* Fetch a byte */ > - testb %al, %al /* Is it NUL? */ > - movb %al, (%rdx) /* Store it */ > - jz 4f /* If it was NUL, done! */ > - incq %rsi > - incq %rdx > - decl %ecx > - jnz 0b > - > -5: > - movq $0xfefefefefefefeff,%r8 > - > - /* Now the sources is aligned. Unfortunatly we cannot force > - to have both source and destination aligned, so ignore the > - alignment of the destination. */ > - .p2align 4 > -1: > - /* 1st unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. 
*/ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 3f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 2nd unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 3f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 3rd unroll. */ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 3f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - > - /* 4th unroll. 
*/ > - movq (%rsi), %rax /* Read double word (8 bytes). */ > - addq $8, %rsi /* Adjust pointer for next word. */ > - movq %rax, %r9 /* Save a copy for NUL finding. */ > - addq %r8, %r9 /* add the magic value to the word. We get > - carry bits reported for each byte which > - is *not* 0 */ > - jnc 3f /* highest byte is NUL => return pointer */ > - xorq %rax, %r9 /* (word+magic)^word */ > - orq %r8, %r9 /* set all non-carry bits */ > - incq %r9 /* add 1: if one carry bit was *not* set > - the addition will not result in 0. */ > - > - jnz 3f /* found NUL => return pointer */ > - > - movq %rax, (%rdx) /* Write value to destination. */ > - addq $8, %rdx /* Adjust pointer. */ > - jmp 1b /* Next iteration. */ > - > - /* Do the last few bytes. %rax contains the value to write. > - The loop is unrolled twice. */ > - .p2align 4 > -3: > - /* Note that stpcpy needs to return with the value of the NUL > - byte. */ > - movb %al, (%rdx) /* 1st byte. */ > - testb %al, %al /* Is it NUL. */ > - jz 4f /* yes, finish. */ > - incq %rdx /* Increment destination. */ > - movb %ah, (%rdx) /* 2nd byte. */ > - testb %ah, %ah /* Is it NUL?. */ > - jz 4f /* yes, finish. */ > - incq %rdx /* Increment destination. */ > - shrq $16, %rax /* Shift... */ > - jmp 3b /* and look at next two bytes in %rax. */ > - > -4: > -#ifdef USE_AS_STPCPY > - movq %rdx, %rax /* Destination is return value. */ > -#else > - movq %rdi, %rax /* Source is return value. */ > -#endif > - retq > -END (STRCPY) > -#ifndef USE_AS_STPCPY > +#define STRCPY strcpy > +#include "multiarch/strcpy-sse2.S" > libc_hidden_builtin_def (strcpy) > -#endif > -- > 2.34.1 > LGTM. Thanks. -- H.J.