[PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S
@ 2021-02-01  0:30 noah
  2021-02-01  0:30 ` [PATCH v2 2/2] x86: Add additional benchmarks for strchr noah
                   ` (2 more replies)
  0 siblings, 3 replies; 5+ messages in thread
From: noah @ 2021-02-01  0:30 UTC (permalink / raw)
  To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools

No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
Changes since V1: optimized more for smaller lengths. The original version
expected the 4x loop to be hit, though the benchmarks in bench-strchr.c
indicate that optimizing for very short strings is most important.

Made the first 32-byte check expect to find either the end of the
string or the character in question. Also increased the number of vectors
in L(aligned_more) to 4. This does cost time for most alignments if the 4x
loop is hit, but it is faster for strings < 128 bytes.
    
 sysdeps/x86_64/multiarch/strchr-avx2.S | 247 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   1 +
 2 files changed, 124 insertions(+), 124 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..3012cb6ece 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
 
@@ -39,19 +41,25 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section .text.avx,"ax",@progbits
 ENTRY (STRCHR)
-	movl	%edi, %ecx
+    movl	%edi, %ecx
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+    
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 	vpxor	%xmm9, %xmm9, %xmm9
 	VPBROADCAST %xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
-
+    
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(cross_page_boundary)
+    
 	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-	jmp	L(more_4x_vec)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+	jz	L(more_vecs)
+    tzcntl	%eax, %eax
 	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
-	.p2align 4
+    .p2align 4
+L(more_vecs):    
+	/* Align data for aligned loads in the loop.  */
+    andq	$-VEC_SIZE, %rdi
 L(aligned_more):
-	addq	$VEC_SIZE, %rdi
 
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+       since data is only aligned to VEC_SIZE.  */
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+    addq    $VEC_SIZE, %rdi
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+	jnz	L(first_vec_x2)    
 
 	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x3)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-
-	VPCMPEQ %ymm5, %ymm0, %ymm1
-	VPCMPEQ %ymm6, %ymm0, %ymm2
-	VPCMPEQ %ymm7, %ymm0, %ymm3
-	VPCMPEQ %ymm8, %ymm0, %ymm4
-
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	VPCMPEQ %ymm6, %ymm9, %ymm6
-	VPCMPEQ %ymm7, %ymm9, %ymm7
-	VPCMPEQ %ymm8, %ymm9, %ymm8
-
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
+	jz	L(prep_loop_4x)
 
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	jmp	L(loop_4x_vec)
+    tzcntl	%eax, %eax
+    leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
 
-	.p2align 4
+    .p2align 4
 L(first_vec_x0):
+    tzcntl	%eax, %eax
 	/* Found CHAR or the null byte.  */
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
-
+    
 	.p2align 4
 L(first_vec_x1):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+    tzcntl	%eax, %eax
+    leaq	VEC_SIZE(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
-	ret
-
-	.p2align 4
+	ret    
+    
+    .p2align 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
+    tzcntl	%eax, %eax
+	/* Found CHAR or the null byte.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
+    
+L(prep_loop_4x):
+    /* Align data to 4 * VEC_SIZE.  */
+	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+    /* Leaves only CHARS matching esi as 0.  */
+    vpxor   %ymm5, %ymm0, %ymm1
+    vpxor   %ymm6, %ymm0, %ymm2
+    vpxor   %ymm7, %ymm0, %ymm3
+    vpxor   %ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+    VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+    addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+    jz	L(loop_4x_vec)
+    
+    VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
+
+    VPCMPEQ %ymm2, %ymm9, %ymm2
+    vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+
+    VPCMPEQ %ymm3, %ymm9, %ymm3
+    VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
 	vpmovmskb %ymm4, %eax
+    salq    $32, %rax
+    orq     %rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+    /* Cold case for crossing page with first load.  */
+	.p2align 4
+L(cross_page_boundary):
+    andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.  */
+	sarxl	%ecx, %eax, %eax
 	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(aligned_more)    
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
+    addq	%rcx, %rdi
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp     (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
 
   if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
       && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     return OPTIMIZE (avx2);
 
-- 
2.29.2


^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v2 2/2] x86: Add additional benchmarks for strchr
  2021-02-01  0:30 [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S noah
@ 2021-02-01  0:30 ` noah
  2021-02-01 17:10   ` H.J. Lu
  2021-02-01 17:08 ` [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
  2021-02-02  7:23 ` [PATCH v3 " goldstein.w.n
  2 siblings, 1 reply; 5+ messages in thread
From: noah @ 2021-02-01  0:30 UTC (permalink / raw)
  To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools

This patch adds additional benchmarks for string size of 4096 and
several benchmarks for string size 256 with different alignments.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
Added 2 additional benchmark sizes:

4096: Just feels like a natural "large" size to test
    
256 with multiple alignments: This essentially is to test how
expensive the initial work prior to the 4x loop is depending on
different alignments.

Results from bench-strchr: all times are in seconds and are the median of
100 runs.  Old is the current strchr-avx2.S implementation. New is this
patch.

Summary: New is definitely faster for medium -> large sizes. Once the
4x loop is hit there is a 10%+ speedup and New always wins out. For
smaller sizes there is more variance as to which is faster and the
differences are small. Generally it seems the New version wins
out. This is likely because 0 - 31 sized strings are the fast path for
New (no jmp).

Benchmarking CPU:
Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz

size, algn, Old T , New T  -------- Win  Dif
0   , 0   , 2.54  , 2.52   -------- New  -0.02
1   , 0   , 2.57  , 2.52   -------- New  -0.05
2   , 0   , 2.56  , 2.52   -------- New  -0.04
3   , 0   , 2.58  , 2.54   -------- New  -0.04
4   , 0   , 2.61  , 2.55   -------- New  -0.06
5   , 0   , 2.65  , 2.62   -------- New  -0.03
6   , 0   , 2.73  , 2.74   -------- Old  -0.01
7   , 0   , 2.75  , 2.74   -------- New  -0.01
8   , 0   , 2.62  , 2.6    -------- New  -0.02
9   , 0   , 2.73  , 2.75   -------- Old  -0.02
10  , 0   , 2.74  , 2.74   -------- Eq    N/A
11  , 0   , 2.76  , 2.72   -------- New  -0.04
12  , 0   , 2.74  , 2.72   -------- New  -0.02
13  , 0   , 2.75  , 2.72   -------- New  -0.03
14  , 0   , 2.74  , 2.73   -------- New  -0.01
15  , 0   , 2.74  , 2.73   -------- New  -0.01
16  , 0   , 2.74  , 2.73   -------- New  -0.01
17  , 0   , 2.74  , 2.74   -------- Eq    N/A
18  , 0   , 2.73  , 2.73   -------- Eq    N/A
19  , 0   , 2.73  , 2.73   -------- Eq    N/A
20  , 0   , 2.73  , 2.73   -------- Eq    N/A
21  , 0   , 2.73  , 2.72   -------- New  -0.01
22  , 0   , 2.71  , 2.74   -------- Old  -0.03
23  , 0   , 2.71  , 2.69   -------- New  -0.02
24  , 0   , 2.68  , 2.67   -------- New  -0.01
25  , 0   , 2.66  , 2.62   -------- New  -0.04
26  , 0   , 2.64  , 2.62   -------- New  -0.02
27  , 0   , 2.71  , 2.64   -------- New  -0.07
28  , 0   , 2.67  , 2.69   -------- Old  -0.02
29  , 0   , 2.72  , 2.72   -------- Eq    N/A
30  , 0   , 2.68  , 2.69   -------- Old  -0.01
31  , 0   , 2.68  , 2.68   -------- Eq    N/A
32  , 0   , 3.51  , 3.52   -------- Old  -0.01
32  , 1   , 3.52  , 3.51   -------- New  -0.01
64  , 0   , 3.97  , 3.93   -------- New  -0.04
64  , 2   , 3.95  , 3.9    -------- New  -0.05
64  , 1   , 4.0   , 3.93   -------- New  -0.07
64  , 3   , 3.97  , 3.88   -------- New  -0.09
64  , 4   , 3.95  , 3.89   -------- New  -0.06
64  , 5   , 3.94  , 3.9    -------- New  -0.04
64  , 6   , 3.97  , 3.9    -------- New  -0.07
64  , 7   , 3.97  , 3.91   -------- New  -0.06
96  , 0   , 4.74  , 4.52   -------- New  -0.22
128 , 0   , 5.29  , 5.19   -------- New  -0.1
128 , 2   , 5.29  , 5.15   -------- New  -0.14
128 , 3   , 5.31  , 5.22   -------- New  -0.09
256 , 0   , 11.19 , 9.81   -------- New  -1.38
256 , 3   , 11.19 , 9.84   -------- New  -1.35
256 , 4   , 11.2  , 9.88   -------- New  -1.32
256 , 16  , 11.21 , 9.79   -------- New  -1.42
256 , 32  , 11.39 , 10.34  -------- New  -1.05
256 , 48  , 11.88 , 10.56  -------- New  -1.32
256 , 64  , 11.82 , 10.83  -------- New  -0.99
256 , 80  , 11.85 , 10.86  -------- New  -0.99
256 , 96  , 9.56  , 8.76   -------- New  -0.8 
256 , 112 , 9.55  , 8.9    -------- New  -0.65
512 , 0   , 15.76 , 13.72  -------- New  -2.04
512 , 4   , 15.72 , 13.74  -------- New  -1.98
512 , 5   , 15.73 , 13.74  -------- New  -1.99
1024, 0   , 24.85 , 21.33  -------- New  -3.52
1024, 5   , 24.86 , 21.27  -------- New  -3.59
1024, 6   , 24.87 , 21.32  -------- New  -3.55
2048, 0   , 45.75 , 36.7   -------- New  -9.05
2048, 6   , 43.91 , 35.42  -------- New  -8.49
2048, 7   , 44.43 , 36.37  -------- New  -8.06
4096, 0   , 96.94 , 81.34  -------- New  -15.6
4096, 7   , 97.01 , 81.32  -------- New  -15.69


    
 benchtests/bench-strchr.c | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index bf493fe458..5fd98a5d43 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -100,9 +100,13 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
   size_t i;
   CHAR *result;
   CHAR *buf = (CHAR *) buf1;
-  align &= 15;
+
+  align &= 127;
   if ((align + len) * sizeof (CHAR) >= page_size)
-    return;
+    {
+      return;                
+    }
+
 
   for (i = 0; i < len; ++i)
     {
@@ -151,12 +155,24 @@ test_main (void)
       do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
     }
 
+  for (i = 1; i < 8; ++i)
+    {
+      do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
+    }
+
   for (i = 1; i < 8; ++i)
     {
       do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
       do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
     }
 
+  for (i = 0; i < 8; ++i)
+    {
+      do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
+      do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
+    }
+
   for (i = 0; i < 32; ++i)
     {
       do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
@@ -169,12 +185,24 @@ test_main (void)
       do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
     }
 
+  for (i = 1; i < 8; ++i)
+    {
+      do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
+      do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
+    }
+
   for (i = 1; i < 8; ++i)
     {
       do_test (i, 64, 256, 0, MIDDLE_CHAR);
       do_test (i, 64, 256, 0, BIG_CHAR);
     }
 
+  for (i = 0; i < 8; ++i)
+    {
+      do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
+      do_test (16 * i, 256, 512, 0, BIG_CHAR);
+    }
+
   for (i = 0; i < 32; ++i)
     {
       do_test (0, i, i + 1, 0, MIDDLE_CHAR);
-- 
2.29.2


^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2 2/2] x86: Add additional benchmarks for strchr
  2021-02-01  0:30 ` [PATCH v2 2/2] x86: Add additional benchmarks for strchr noah
@ 2021-02-01 17:10   ` H.J. Lu
  0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2021-02-01 17:10 UTC (permalink / raw)
  To: noah; +Cc: GNU C Library, Carlos O'Donell

On Sun, Jan 31, 2021 at 4:30 PM noah <goldstein.w.n@gmail.com> wrote:
>
> This patch adds additional benchmarks for string size of 4096 and
> several benchmarks for string size 256 with different alignments.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> Added 2 additional benchmark sizes:
>
> 4096: Just feels like a natural "large" size to test
>
> 256 with multiple alignments: This essentially is to test how
> expensive the initial work prior to the 4x loop is depending on
> different alignments.
>
> results from bench-strchr: All times are in seconds and the medium of
> 100 runs.  Old is current strchr-avx2.S implementation. New is this
> patch.
>
> Summary: New is definetly faster for medium -> large sizes. Once the
> 4x loop is hit there is a 10%+ speedup and New always wins out. For
> smaller sizes there is more variance as to which is faster and the
> differences are small. Generally it seems the New version wins
> out. This is likely because 0 - 31 sized strings are the fast path for
> new (no jmp).
>
> Benchmarking CPU:
> Icelake: Intel(R) Core(TM) i7-1065G7 CPU @ 1.30GHz
>
> size, algn, Old T , New T  -------- Win  Dif
> 0   , 0   , 2.54  , 2.52   -------- New  -0.02
> 1   , 0   , 2.57  , 2.52   -------- New  -0.05
> 2   , 0   , 2.56  , 2.52   -------- New  -0.04
> 3   , 0   , 2.58  , 2.54   -------- New  -0.04
> 4   , 0   , 2.61  , 2.55   -------- New  -0.06
> 5   , 0   , 2.65  , 2.62   -------- New  -0.03
> 6   , 0   , 2.73  , 2.74   -------- Old  -0.01
> 7   , 0   , 2.75  , 2.74   -------- New  -0.01
> 8   , 0   , 2.62  , 2.6    -------- New  -0.02
> 9   , 0   , 2.73  , 2.75   -------- Old  -0.02
> 10  , 0   , 2.74  , 2.74   -------- Eq    N/A
> 11  , 0   , 2.76  , 2.72   -------- New  -0.04
> 12  , 0   , 2.74  , 2.72   -------- New  -0.02
> 13  , 0   , 2.75  , 2.72   -------- New  -0.03
> 14  , 0   , 2.74  , 2.73   -------- New  -0.01
> 15  , 0   , 2.74  , 2.73   -------- New  -0.01
> 16  , 0   , 2.74  , 2.73   -------- New  -0.01
> 17  , 0   , 2.74  , 2.74   -------- Eq    N/A
> 18  , 0   , 2.73  , 2.73   -------- Eq    N/A
> 19  , 0   , 2.73  , 2.73   -------- Eq    N/A
> 20  , 0   , 2.73  , 2.73   -------- Eq    N/A
> 21  , 0   , 2.73  , 2.72   -------- New  -0.01
> 22  , 0   , 2.71  , 2.74   -------- Old  -0.03
> 23  , 0   , 2.71  , 2.69   -------- New  -0.02
> 24  , 0   , 2.68  , 2.67   -------- New  -0.01
> 25  , 0   , 2.66  , 2.62   -------- New  -0.04
> 26  , 0   , 2.64  , 2.62   -------- New  -0.02
> 27  , 0   , 2.71  , 2.64   -------- New  -0.07
> 28  , 0   , 2.67  , 2.69   -------- Old  -0.02
> 29  , 0   , 2.72  , 2.72   -------- Eq    N/A
> 30  , 0   , 2.68  , 2.69   -------- Old  -0.01
> 31  , 0   , 2.68  , 2.68   -------- Eq    N/A
> 32  , 0   , 3.51  , 3.52   -------- Old  -0.01
> 32  , 1   , 3.52  , 3.51   -------- New  -0.01
> 64  , 0   , 3.97  , 3.93   -------- New  -0.04
> 64  , 2   , 3.95  , 3.9    -------- New  -0.05
> 64  , 1   , 4.0   , 3.93   -------- New  -0.07
> 64  , 3   , 3.97  , 3.88   -------- New  -0.09
> 64  , 4   , 3.95  , 3.89   -------- New  -0.06
> 64  , 5   , 3.94  , 3.9    -------- New  -0.04
> 64  , 6   , 3.97  , 3.9    -------- New  -0.07
> 64  , 7   , 3.97  , 3.91   -------- New  -0.06
> 96  , 0   , 4.74  , 4.52   -------- New  -0.22
> 128 , 0   , 5.29  , 5.19   -------- New  -0.1
> 128 , 2   , 5.29  , 5.15   -------- New  -0.14
> 128 , 3   , 5.31  , 5.22   -------- New  -0.09
> 256 , 0   , 11.19 , 9.81   -------- New  -1.38
> 256 , 3   , 11.19 , 9.84   -------- New  -1.35
> 256 , 4   , 11.2  , 9.88   -------- New  -1.32
> 256 , 16  , 11.21 , 9.79   -------- New  -1.42
> 256 , 32  , 11.39 , 10.34  -------- New  -1.05
> 256 , 48  , 11.88 , 10.56  -------- New  -1.32
> 256 , 64  , 11.82 , 10.83  -------- New  -0.99
> 256 , 80  , 11.85 , 10.86  -------- New  -0.99
> 256 , 96  , 9.56  , 8.76   -------- New  -0.8
> 256 , 112 , 9.55  , 8.9    -------- New  -0.65
> 512 , 0   , 15.76 , 13.72  -------- New  -2.04
> 512 , 4   , 15.72 , 13.74  -------- New  -1.98
> 512 , 5   , 15.73 , 13.74  -------- New  -1.99
> 1024, 0   , 24.85 , 21.33  -------- New  -3.52
> 1024, 5   , 24.86 , 21.27  -------- New  -3.59
> 1024, 6   , 24.87 , 21.32  -------- New  -3.55
> 2048, 0   , 45.75 , 36.7   -------- New  -9.05
> 2048, 6   , 43.91 , 35.42  -------- New  -8.49
> 2048, 7   , 44.43 , 36.37  -------- New  -8.06
> 4096, 0   , 96.94 , 81.34  -------- New  -15.6
> 4096, 7   , 97.01 , 81.32  -------- New  -15.69
>
>
>
>  benchtests/bench-strchr.c | 32 ++++++++++++++++++++++++++++++--
>  1 file changed, 30 insertions(+), 2 deletions(-)
>
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index bf493fe458..5fd98a5d43 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -100,9 +100,13 @@ do_test (size_t align, size_t pos, size_t len, int seek_char, int max_char)
>    size_t i;
>    CHAR *result;
>    CHAR *buf = (CHAR *) buf1;
> -  align &= 15;
> +
> +  align &= 127;
>    if ((align + len) * sizeof (CHAR) >= page_size)
> -    return;
> +    {
> +      return;
> +    }
> +
>
>    for (i = 0; i < len; ++i)
>      {
> @@ -151,12 +155,24 @@ test_main (void)
>        do_test (i, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
>      }
>
> +  for (i = 1; i < 8; ++i)
> +    {
> +      do_test (0, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (i, 16 << i, 4096, SMALL_CHAR, MIDDLE_CHAR);
> +    }
> +
>    for (i = 1; i < 8; ++i)
>      {
>        do_test (i, 64, 256, SMALL_CHAR, MIDDLE_CHAR);
>        do_test (i, 64, 256, SMALL_CHAR, BIG_CHAR);
>      }
>
> +  for (i = 0; i < 8; ++i)
> +    {
> +      do_test (16 * i, 256, 512, SMALL_CHAR, MIDDLE_CHAR);
> +      do_test (16 * i, 256, 512, SMALL_CHAR, BIG_CHAR);
> +    }
> +
>    for (i = 0; i < 32; ++i)
>      {
>        do_test (0, i, i + 1, SMALL_CHAR, MIDDLE_CHAR);
> @@ -169,12 +185,24 @@ test_main (void)
>        do_test (i, 16 << i, 2048, 0, MIDDLE_CHAR);
>      }
>
> +  for (i = 1; i < 8; ++i)
> +    {
> +      do_test (0, 16 << i, 4096, 0, MIDDLE_CHAR);
> +      do_test (i, 16 << i, 4096, 0, MIDDLE_CHAR);
> +    }
> +
>    for (i = 1; i < 8; ++i)
>      {
>        do_test (i, 64, 256, 0, MIDDLE_CHAR);
>        do_test (i, 64, 256, 0, BIG_CHAR);
>      }
>
> +  for (i = 0; i < 8; ++i)
> +    {
> +      do_test (16 * i, 256, 512, 0, MIDDLE_CHAR);
> +      do_test (16 * i, 256, 512, 0, BIG_CHAR);
> +    }
> +
>    for (i = 0; i < 32; ++i)
>      {
>        do_test (0, i, i + 1, 0, MIDDLE_CHAR);
> --
> 2.29.2

Please make similar changes in string/test-strchr.c.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S
  2021-02-01  0:30 [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S noah
  2021-02-01  0:30 ` [PATCH v2 2/2] x86: Add additional benchmarks for strchr noah
@ 2021-02-01 17:08 ` H.J. Lu
  2021-02-02  7:23 ` [PATCH v3 " goldstein.w.n
  2 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2021-02-01 17:08 UTC (permalink / raw)
  To: noah; +Cc: GNU C Library, Carlos O'Donell

On Sun, Jan 31, 2021 at 4:30 PM noah <goldstein.w.n@gmail.com> wrote:
>
> No bug. Just seemed the performance could be improved a bit. Observed
> and expected behavior are unchanged. Optimized body of main
> loop. Updated page cross logic and optimized accordingly. Made a few
> minor instruction selection modifications. No regressions in test
> suite. Both test-strchrnul and test-strchr passed.
>
> Signed-off-by: noah <goldstein.w.n@gmail.com>
> ---
> Since V1 optimized more around smaller lengths. The original version
> expected the 4x loop to be hit though the benchmarks in bench-strchr.c
> indicate optimization for very short strings is most important.
>
> Made the first 32 byte check expect to find either the end of the
> string or character in question. As well increased number of vectors
> in L(aligned_more) to 4. This does cost for most alignments if the 4x
> loop is hit but is faster for strings < 128 byte.
>
>  sysdeps/x86_64/multiarch/strchr-avx2.S | 247 ++++++++++++-------------
>  sysdeps/x86_64/multiarch/strchr.c      |   1 +
>  2 files changed, 124 insertions(+), 124 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
> index d416558d04..3012cb6ece 100644
> --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
> @@ -27,10 +27,12 @@
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPCMPEQ      vpcmpeqd
> +#  define VPMINU       vpminud
>  #  define CHAR_REG     esi
>  # else
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPCMPEQ      vpcmpeqb
> +#  define VPMINU       vpminub
>  #  define CHAR_REG     sil
>  # endif
>
> @@ -39,19 +41,25 @@
>  # endif
>
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
>         .section .text.avx,"ax",@progbits
>  ENTRY (STRCHR)
> -       movl    %edi, %ecx
> +    movl       %edi, %ecx

You replaced a tab with 8 spaces.   Please put the tab back
and replace 8 spaces with a tab.

> +# ifndef USE_AS_STRCHRNUL
> +       xorl    %edx, %edx
> +# endif
> +
>         /* Broadcast CHAR to YMM0.  */
>         vmovd   %esi, %xmm0
>         vpxor   %xmm9, %xmm9, %xmm9
>         VPBROADCAST %xmm0, %ymm0
> -       /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> -
> +
> +       /* Check if we cross page boundary with one vector load.  */
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(cross_page_boundary)
> +
>         /* Check the first VEC_SIZE bytes.  Search for both CHAR and the
>            null byte.  */
>         vmovdqu (%rdi), %ymm8
> @@ -60,50 +68,27 @@ ENTRY (STRCHR)
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -
> -       jmp     L(more_4x_vec)
> -
> -       .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       vmovdqu (%rdi), %ymm8
> -       VPCMPEQ %ymm8, %ymm0, %ymm1
> -       VPCMPEQ %ymm8, %ymm9, %ymm2
> -       vpor    %ymm1, %ymm2, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       /* Remove the leading bytes.  */
> -       sarl    %cl, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> +       jz      L(more_vecs)
> +    tzcntl     %eax, %eax
>         /* Found CHAR or the null byte.  */
> -       tzcntl  %eax, %eax
> -       addq    %rcx, %rax
> -# ifdef USE_AS_STRCHRNUL
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
>
> -       .p2align 4
> +    .p2align 4
> +L(more_vecs):
> +       /* Align data for aligned loads in the loop.  */
> +    andq       $-VEC_SIZE, %rdi
>  L(aligned_more):
> -       addq    $VEC_SIZE, %rdi
>
> -L(more_4x_vec):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
> -       vmovdqa (%rdi), %ymm8
> +       /* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> +       since data is only aligned to VEC_SIZE.  */
> +       vmovdqa VEC_SIZE(%rdi), %ymm8
> +    addq    $VEC_SIZE, %rdi
>         VPCMPEQ %ymm8, %ymm0, %ymm1
>         VPCMPEQ %ymm8, %ymm9, %ymm2
>         vpor    %ymm1, %ymm2, %ymm1
> @@ -125,7 +110,7 @@ L(more_4x_vec):
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> +       jnz     L(first_vec_x2)
>
>         vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
>         VPCMPEQ %ymm8, %ymm0, %ymm1
> @@ -133,122 +118,136 @@ L(more_4x_vec):
>         vpor    %ymm1, %ymm2, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x3)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -
> -       .p2align 4
> -L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm5
> -       vmovdqa VEC_SIZE(%rdi), %ymm6
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
> -
> -       VPCMPEQ %ymm5, %ymm0, %ymm1
> -       VPCMPEQ %ymm6, %ymm0, %ymm2
> -       VPCMPEQ %ymm7, %ymm0, %ymm3
> -       VPCMPEQ %ymm8, %ymm0, %ymm4
> -
> -       VPCMPEQ %ymm5, %ymm9, %ymm5
> -       VPCMPEQ %ymm6, %ymm9, %ymm6
> -       VPCMPEQ %ymm7, %ymm9, %ymm7
> -       VPCMPEQ %ymm8, %ymm9, %ymm8
> -
> -       vpor    %ymm1, %ymm5, %ymm1
> -       vpor    %ymm2, %ymm6, %ymm2
> -       vpor    %ymm3, %ymm7, %ymm3
> -       vpor    %ymm4, %ymm8, %ymm4
> -
> -       vpor    %ymm1, %ymm2, %ymm5
> -       vpor    %ymm3, %ymm4, %ymm6
> +       jz      L(prep_loop_4x)
>
> -       vpor    %ymm5, %ymm6, %ymm5
> -
> -       vpmovmskb %ymm5, %eax
> -       testl   %eax, %eax
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -       jmp     L(loop_4x_vec)
> +    tzcntl     %eax, %eax
> +    leaq       (VEC_SIZE * 3)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       cmovne  %rdx, %rax
> +# endif
> +       VZEROUPPER
> +       ret
>
> -       .p2align 4
> +    .p2align 4
>  L(first_vec_x0):
> +    tzcntl     %eax, %eax
>         /* Found CHAR or the null byte.  */
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
> -
> +
>         .p2align 4
>  L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $VEC_SIZE, %rax
> -       addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    VEC_SIZE(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +    tzcntl     %eax, %eax
> +    leaq       VEC_SIZE(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
> -       ret
> -
> -       .p2align 4
> +       ret
> +
> +    .p2align 4
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $(VEC_SIZE * 2), %rax
> -       addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> +    tzcntl     %eax, %eax
> +       /* Found CHAR or the null byte.  */
>         leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
> +
> +L(prep_loop_4x):
> +    /* Align data to 4 * VEC_SIZE.  */
> +       andq    $-(VEC_SIZE * 4), %rdi
>
>         .p2align 4
> -L(4x_vec_end):
> +L(loop_4x_vec):
> +       /* Compare 4 * VEC at a time forward.  */
> +       vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
> +       vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
> +       vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
> +       vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
> +
> +    /* Leaves only CHARS matching esi as 0.  */
> +    vpxor   %ymm5, %ymm0, %ymm1
> +    vpxor   %ymm6, %ymm0, %ymm2
> +    vpxor   %ymm7, %ymm0, %ymm3
> +    vpxor   %ymm8, %ymm0, %ymm4
> +
> +       VPMINU  %ymm1, %ymm5, %ymm1
> +       VPMINU  %ymm2, %ymm6, %ymm2
> +       VPMINU  %ymm3, %ymm7, %ymm3
> +       VPMINU  %ymm4, %ymm8, %ymm4
> +
> +       VPMINU  %ymm1, %ymm2, %ymm5
> +       VPMINU  %ymm3, %ymm4, %ymm6
> +
> +       VPMINU  %ymm5, %ymm6, %ymm5
> +
> +    VPCMPEQ %ymm5, %ymm9, %ymm5
> +       vpmovmskb %ymm5, %eax
> +
> +    addq       $(VEC_SIZE * 4), %rdi
> +       testl   %eax, %eax
> +    jz L(loop_4x_vec)
> +
> +    VPCMPEQ %ymm1, %ymm9, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x0)
> -       vpmovmskb %ymm2, %eax
> +
> +    VPCMPEQ %ymm2, %ymm9, %ymm2
> +    vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
> -       vpmovmskb %ymm3, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> +
> +    VPCMPEQ %ymm3, %ymm9, %ymm3
> +    VPCMPEQ %ymm4, %ymm9, %ymm4
> +       vpmovmskb %ymm3, %ecx
>         vpmovmskb %ymm4, %eax
> +    salq    $32, %rax
> +    orq     %rcx, %rax
> +       tzcntq  %rax, %rax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       cmovne  %rdx, %rax
> +# endif
> +       VZEROUPPER
> +       ret
> +
> +    /* Cold case for crossing page with first load.  */
> +       .p2align 4
> +L(cross_page_boundary):
> +    andq       $-VEC_SIZE, %rdi
> +       andl    $(VEC_SIZE - 1), %ecx
> +
> +       vmovdqa (%rdi), %ymm8
> +       VPCMPEQ %ymm8, %ymm0, %ymm1
> +       VPCMPEQ %ymm8, %ymm9, %ymm2
> +       vpor    %ymm1, %ymm2, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       /* Remove the leading bits.  */
> +       sarxl   %ecx, %eax, %eax
>         testl   %eax, %eax
> -L(first_vec_x3):
> +       jz      L(aligned_more)
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_STRCHRNUL
> -       addq    $(VEC_SIZE * 3), %rax
> +    addq       %rcx, %rdi
>         addq    %rdi, %rax
> -# else
> -       xorl    %edx, %edx
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> -       cmp     (%rax), %CHAR_REG
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
>         cmovne  %rdx, %rax
>  # endif
>         VZEROUPPER
>         ret
>
>  END (STRCHR)
> -#endif
> +# endif
> diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
> index 583a152794..4dfbe3b58b 100644
> --- a/sysdeps/x86_64/multiarch/strchr.c
> +++ b/sysdeps/x86_64/multiarch/strchr.c
> @@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
>
>    if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
>        && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
> +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      return OPTIMIZE (avx2);
>
> --
> 2.29.2
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* [PATCH v3 1/2] x86: Refactor and improve performance of strchr-avx2.S
  2021-02-01  0:30 [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S noah
  2021-02-01  0:30 ` [PATCH v2 2/2] x86: Add additional benchmarks for strchr noah
  2021-02-01 17:08 ` [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
@ 2021-02-02  7:23 ` goldstein.w.n
  2 siblings, 0 replies; 5+ messages in thread
From: goldstein.w.n @ 2021-02-02  7:23 UTC (permalink / raw)
  To: libc-alpha; +Cc: carlos, goldstein.w.n, hjl.tools

From: noah <goldstein.w.n@gmail.com>

No bug. This patch only improves performance; observed and expected
behavior are unchanged. It optimizes the body of the main loop, updates
the page-cross logic and optimizes accordingly, and makes a few minor
instruction-selection changes. The new page-cross path uses sarx, so
the ifunc selector now also requires BMI2 for the AVX2 version. No
regressions in the test suite: both test-strchrnul and test-strchr
pass.

Signed-off-by: noah <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 235 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   1 +
 2 files changed, 118 insertions(+), 118 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index d416558d04..806ca66a9b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
 
@@ -39,20 +41,26 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section .text.avx,"ax",@progbits
 ENTRY (STRCHR)
 	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+	
+	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	vpxor	%xmm9, %xmm9, %xmm9
 	VPBROADCAST %xmm0, %ymm0
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
-
-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+	
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja L(cross_page_boundary)
+	
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -60,50 +68,27 @@ ENTRY (STRCHR)
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-	jmp	L(more_4x_vec)
-
-	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqu	(%rdi), %ymm8
-	VPCMPEQ %ymm8, %ymm0, %ymm1
-	VPCMPEQ %ymm8, %ymm9, %ymm2
-	vpor	%ymm1, %ymm2, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	/* Found CHAR or the null byte.  */
+	jz	L(more_vecs)
 	tzcntl	%eax, %eax
-	addq	%rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 	.p2align 4
+L(more_vecs):	 
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
 L(aligned_more):
-	addq	$VEC_SIZE, %rdi
 
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.	*/
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
@@ -125,7 +110,7 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+	jnz	L(first_vec_x2)	   
 
 	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
@@ -133,122 +118,136 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x3)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-	.p2align 4
-L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa	(%rdi), %ymm5
-	vmovdqa	VEC_SIZE(%rdi), %ymm6
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
-
-	VPCMPEQ %ymm5, %ymm0, %ymm1
-	VPCMPEQ %ymm6, %ymm0, %ymm2
-	VPCMPEQ %ymm7, %ymm0, %ymm3
-	VPCMPEQ %ymm8, %ymm0, %ymm4
-
-	VPCMPEQ %ymm5, %ymm9, %ymm5
-	VPCMPEQ %ymm6, %ymm9, %ymm6
-	VPCMPEQ %ymm7, %ymm9, %ymm7
-	VPCMPEQ %ymm8, %ymm9, %ymm8
-
-	vpor	%ymm1, %ymm5, %ymm1
-	vpor	%ymm2, %ymm6, %ymm2
-	vpor	%ymm3, %ymm7, %ymm3
-	vpor	%ymm4, %ymm8, %ymm4
-
-	vpor	%ymm1, %ymm2, %ymm5
-	vpor	%ymm3, %ymm4, %ymm6
-
-	vpor	%ymm5, %ymm6, %ymm5
-
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
+	jz	L(prep_loop_4x)
 
-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
 
 	.p2align 4
 L(first_vec_x0):
-	/* Found CHAR or the null byte.  */
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
-
+	
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
 	leaq	VEC_SIZE(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
-	ret
-
+	ret	   
+	
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
+	/* Found CHAR or the null byte.	 */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
+	
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.	*/
+	andq	$-(VEC_SIZE * 4), %rdi
 
 	.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz	L(loop_4x_vec)
+	
+	VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x0)
+
+	VPCMPEQ %ymm2, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
+
+	VPCMPEQ %ymm3, %ymm9, %ymm3
+	VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
 	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq		%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.	 */
+	sarxl	%ecx, %eax, %eax
 	testl	%eax, %eax
-L(first_vec_x3):
+	jz	L(aligned_more)	   
 	tzcntl	%eax, %eax
-# ifdef USE_AS_STRCHRNUL
-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
 	addq	%rdi, %rax
-# else
-	xorl	%edx, %edx
-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp		(%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER
 	ret
 
 END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 583a152794..4dfbe3b58b 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -37,6 +37,7 @@ IFUNC_SELECTOR (void)
 
   if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
       && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     return OPTIMIZE (avx2);
 
-- 
2.29.2


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2021-02-02  7:24 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-02-01  0:30 [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S noah
2021-02-01  0:30 ` [PATCH v2 2/2] x86: Add additional benchmarks for strchr noah
2021-02-01 17:10   ` H.J. Lu
2021-02-01 17:08 ` [PATCH v2 1/2] x86: Refactor and improve performance of strchr-avx2.S H.J. Lu
2021-02-02  7:23 ` [PATCH v3 " goldstein.w.n

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).