public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
@ 2022-07-13 22:55 Noah Goldstein
  0 siblings, 0 replies; only message in thread
From: Noah Goldstein @ 2022-07-13 22:55 UTC (permalink / raw)
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=72a48ec0f78c7fd948fe476eb41f69c071f48964

commit 72a48ec0f78c7fd948fe476eb41f69c071f48964
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Jul 12 12:29:06 2022 -0700

    x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
    
    This commit doesn't affect libc.so.6, its just housekeeping to prepare
    for adding explicit ISA level support.
    
    Tested build on x86_64 and x86_32 with/without multiarch.

Diff:
---
 sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++++++++++-
 sysdeps/x86_64/strcat.S                | 239 +-------------------------------
 2 files changed, 238 insertions(+), 243 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S
index 449e102438..244c4a6d74 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2.S
@@ -17,12 +17,242 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
+# ifndef STRCAT
+#  define STRCAT __strcat_sse2
+# endif
+#endif
 
-# include <sysdep.h>
-# define strcat __strcat_sse2
+#include <sysdep.h>
+
+	.text
+ENTRY (STRCAT)
+	movq %rdi, %rcx		/* Dest. register. */
+	andl $7, %ecx		/* mask alignment bits */
+	movq %rdi, %rax		/* Duplicate destination pointer.  */
+	movq $0xfefefefefefefeff,%r8
+
+	/* First step: Find end of destination.  */
+	jz 4f			/* aligned => start loop */
+
+	neg %ecx		/* We need to align to 8 bytes.  */
+	addl $8,%ecx
+	/* Search the first bytes directly.  */
+0:	cmpb $0x0,(%rax)	/* is byte NUL? */
+	je 2f			/* yes => start copy */
+	incq %rax		/* increment pointer */
+	decl %ecx
+	jnz 0b
+
+
+
+	/* Now the source is aligned.  Scan for NUL byte.  */
+	.p2align 4
+4:
+	/* First unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Second unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Third unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Fourth unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz 4b			/* no NUL found => continue loop */
+
+	.p2align 4		/* Align, it's a jump target.  */
+3:	subq $8,%rax		/* correct pointer increment.  */
+
+	testb %cl, %cl		/* is first byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testl $0x00ff0000, %ecx /* is third byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+	testl $0xff000000, %ecx /* is fourth byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+	shrq $32, %rcx		/* look at other half.  */
+
+	testb %cl, %cl		/* is first byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+2:
+	/* Second step: Copy source to destination.  */
+
+	movq	%rsi, %rcx	/* duplicate  */
+	andl	$7,%ecx		/* mask alignment bits */
+	movq	%rax, %rdx	/* move around */
+	jz	22f		/* aligned => start loop */
+
+	neg	%ecx		/* align to 8 bytes.  */
+	addl	$8, %ecx
+	/* Align the source pointer.  */
+21:
+	movb	(%rsi), %al	/* Fetch a byte */
+	testb	%al, %al	/* Is it NUL? */
+	movb	%al, (%rdx)	/* Store it */
+	jz	24f		/* If it was NUL, done! */
+	incq	%rsi
+	incq	%rdx
+	decl	%ecx
+	jnz	21b
+
+	/* Now the sources is aligned.  Unfortunatly we cannot force
+	   to have both source and destination aligned, so ignore the
+	   alignment of the destination.  */
+	.p2align 4
+22:
+	/* 1st unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 2nd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 3rd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 4th unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+	jmp	22b		/* Next iteration.  */
+
+	/* Do the last few bytes. %rax contains the value to write.
+	   The loop is unrolled twice.  */
+	.p2align 4
+23:
+	movb	%al, (%rdx)	/* 1st byte.  */
+	testb	%al, %al	/* Is it NUL.  */
+	jz	24f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	movb	%ah, (%rdx)	/* 2nd byte.  */
+	testb	%ah, %ah	/* Is it NUL?.  */
+	jz	24f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	shrq	$16, %rax	/* Shift...  */
+	jmp	23b		/* and look at next two bytes in %rax.  */
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcat)
-#endif
 
-#include <sysdeps/x86_64/strcat.S>
+24:
+	movq	%rdi, %rax	/* Source is return value.  */
+	retq
+END (STRCAT)
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 565a9c785a..fc3e8a9bcf 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -17,241 +17,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Will be removed when new strcpy implementation gets merged.  */
-
-	.text
-ENTRY (strcat)
-	movq %rdi, %rcx		/* Dest. register. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* Duplicate destination pointer.  */
-	movq $0xfefefefefefefeff,%r8
-
-	/* First step: Find end of destination.  */
-	jz 4f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => start copy */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-
-
-	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
-4:
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-2:
-	/* Second step: Copy source to destination.  */
-
-	movq	%rsi, %rcx	/* duplicate  */
-	andl	$7,%ecx		/* mask alignment bits */
-	movq	%rax, %rdx	/* move around */
-	jz	22f		/* aligned => start loop */
-
-	neg	%ecx		/* align to 8 bytes.  */
-	addl	$8, %ecx
-	/* Align the source pointer.  */
-21:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdx)	/* Store it */
-	jz	24f		/* If it was NUL, done! */
-	incq	%rsi
-	incq	%rdx
-	decl	%ecx
-	jnz	21b
-
-	/* Now the sources is aligned.  Unfortunatly we cannot force
-	   to have both source and destination aligned, so ignore the
-	   alignment of the destination.  */
-	.p2align 4
-22:
-	/* 1st unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 2nd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 3rd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 4th unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-	jmp	22b		/* Next iteration.  */
-
-	/* Do the last few bytes. %rax contains the value to write.
-	   The loop is unrolled twice.  */
-	.p2align 4
-23:
-	movb	%al, (%rdx)	/* 1st byte.  */
-	testb	%al, %al	/* Is it NUL.  */
-	jz	24f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	movb	%ah, (%rdx)	/* 2nd byte.  */
-	testb	%ah, %ah	/* Is it NUL?.  */
-	jz	24f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	shrq	$16, %rax	/* Shift...  */
-	jmp	23b		/* and look at next two bytes in %rax.  */
-
-
-24:
-	movq	%rdi, %rax	/* Source is return value.  */
-	retq
-END (strcat)
+#define STRCAT strcat
+#include "multiarch/strcat-sse2.S"
 libc_hidden_builtin_def (strcat)


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2022-07-13 22:55 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-13 22:55 [glibc] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).