From: noah <goldstein.w.n@gmail.com>
To: libc-alpha@sourceware.org
Cc: carlos@systemhalted.org, goldstein.w.n@gmail.com, hjl.tools@gmail.com
Subject: [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S
Date: Sun, 31 Jan 2021 19:30:14 -0500
Message-ID: <20210201003014.785099-1-goldstein.w.n@gmail.com>

No bug. This patch improves the performance of strchr-avx2.S a bit;
observed and expected behavior are unchanged. It optimizes the body of
the main loop, reworks the page cross logic and optimizes accordingly,
and makes a few minor instruction selection changes. No regressions in
the test suite; both test-strchr and test-strchrnul pass.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
Since V1: optimize more for smaller lengths. The original version
expected the 4x loop to be hit, but the benchmarks in bench-strchr.c
indicate that optimizing for very short strings matters most.

The first 32 byte check now expects to find either the end of the
string or the character in question. The number of vectors checked in
L(aligned_more) is also increased to 4. This costs a bit for most
alignments when the 4x loop is hit, but is faster for strings shorter
than 128 bytes. Illustrative sketches of the new logic follow.
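
For reference, the new page cross test is equivalent to the following
C sketch (illustrative only; the helper name is mine, not part of the
patch):

  #include <stdbool.h>
  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE 32

  /* Return true if a VEC_SIZE byte unaligned load from S would cross
     into the next page, i.e. if fewer than VEC_SIZE bytes remain in
     S's page.  Mirrors the andl/cmpl/ja sequence at the entry.  */
  static inline bool
  load_crosses_page (const char *s)
  {
    uintptr_t off = (uintptr_t) s & (PAGE_SIZE - 1);
    return off > PAGE_SIZE - VEC_SIZE;
  }

Unlike the old (2 * VEC_SIZE - 1) test, this takes the cold path only
when the first load really would cross a page, so the common case
stays on a single unaligned load.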
    
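The rewritten 4x loop folds the CHAR test and the null test into one
result: data ^ broadcast(CHAR) is zero exactly where data matches
CHAR, and the unsigned minimum of that with data itself is zero
exactly where data matches CHAR or is a null byte, so a single VPCMPEQ
against zero finds both. A rough intrinsics sketch of one vector's
worth of work in the byte (strchr) case, with a function name of my
choosing (compile with -mavx2):

  #include <immintrin.h>

  /* Return a mask with one bit per byte of DATA, set where the byte
     equals the broadcast character in BCAST or is 0.  ZERO is an
     all-zero vector (ymm9 in the assembly).  */
  static inline unsigned int
  char_or_null_mask (__m256i data, __m256i bcast, __m256i zero)
  {
    __m256i x = _mm256_xor_si256 (data, bcast); /* 0 where CHAR.  */
    __m256i m = _mm256_min_epu8 (x, data);      /* 0 where CHAR or NUL.  */
    __m256i eq = _mm256_cmpeq_epi8 (m, zero);   /* 0xff at each hit.  */
    return (unsigned int) _mm256_movemask_epi8 (eq);
  }

In the loop the four per-vector minimums are reduced with VPMINU
before the single compare, so the hot path runs one VPCMPEQ and one
VPMOVMSKB per iteration instead of the eight compares and seven ORs of
the old loop.
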
 sysdeps/x86_64/multiarch/strchr-avx2.S | 247 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   1 +
 2 files changed, 124 insertions(+), 124 deletions(-)
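
One more note on the loop tail: when the match lands in the third or
fourth vector, the two 32-bit byte masks are merged into a single
64-bit word so that one tzcnt locates the first hit across both
vectors (the salq/orq/tzcntq sequence). A C sketch of the equivalent
computation (the helper name is mine):

  #include <stdint.h>

  /* MASK_LO and MASK_HI are the VPMOVMSKB results for VEC 3 and
     VEC 4.  The caller guarantees at least one bit is set, since
     __builtin_ctzll (like BSF, unlike TZCNT) is undefined for 0.
     Returns the byte index of the first hit in the 64-byte window.  */
  static inline unsigned int
  first_hit (uint32_t mask_lo, uint32_t mask_hi)
  {
    uint64_t m = ((uint64_t) mask_hi << 32) | mask_lo;
    return (unsigned int) __builtin_ctzll (m);
  }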

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..3012cb6ece 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
 
@@ -39,19 +41,25 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section .text.avx,"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
+	movl	%edi, %ecx
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 	vpxor	%xmm9, %xmm9, %xmm9
 	VPBROADCAST %xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
-
+
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(cross_page_boundary)
+
 	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-	jmp	L(more_4x_vec)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(more_vecs)
+	tzcntl	%eax, %eax
 	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
-	.p2align 4
+	.p2align 4
+L(more_vecs):
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
 L(aligned_more):
-	addq	$VEC_SIZE, %rdi
 
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+	jnz	L(first_vec_x2)
 
 	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x3)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-
-	VPCMPEQ %ymm5, %ymm0, %ymm1
-	VPCMPEQ %ymm6, %ymm0, %ymm2
-	VPCMPEQ %ymm7, %ymm0, %ymm3
-	VPCMPEQ %ymm8, %ymm0, %ymm4
-
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	VPCMPEQ %ymm6, %ymm9, %ymm6
-	VPCMPEQ %ymm7, %ymm9, %ymm7
-	VPCMPEQ %ymm8, %ymm9, %ymm8
-
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
+	jz	L(prep_loop_4x)
 
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
 
-	.p2align 4
+	.p2align 4
 L(first_vec_x0):
+	tzcntl	%eax, %eax
 	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
-
+
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
-	ret
-
-	.p2align 4
+	ret
+
+	.p2align 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
+	tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
+
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.  */
+	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz	L(loop_4x_vec)
+
+	VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
+
+	VPCMPEQ %ymm2, %ymm9, %ymm2
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+
+	VPCMPEQ %ymm3, %ymm9, %ymm3
+	VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
 	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.  */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.  */
+	sarxl	%ecx, %eax, %eax
 	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(aligned_more)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
 
   if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
       && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     return OPTIMIZE (avx2);
 
-- 
2.29.2

