* [PATCH v1 1/3] Bench: Expand bench-memchr.c
@ 2021-05-03 8:44 Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
` (2 more replies)
0 siblings, 3 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 8:44 UTC (permalink / raw)
To: libc-alpha
No bug. This commit adds some additional cases to bench-memchr.c,
including medium sizes and short lengths with both an inbound match
and an out-of-bounds match.
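As an illustration of what the new short-length cases exercise
(assuming do_test (align, pos, len, seek_char) places seek_char at
index pos and times memchr over len bytes, as in bench-memchr.c), a
minimal sketch in plain C:

#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[32] = { 0 };
  buf[5] = 23;	/* seek_char at position 5.  */
  /* Out-of-bounds match (len = pos - 1): the only occurrence lies
     past the searched range, so memchr must return NULL without
     reading beyond len.  */
  assert (memchr (buf, 23, 4) == NULL);
  /* Inbound match (len = pos + 1): the match is within range.  */
  assert (memchr (buf, 23, 6) == buf + 5);
  return 0;
}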
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
benchtests/bench-memchr.c | 13 +++++++++++++
1 file changed, 13 insertions(+)
diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index f5ced9d80d..5573f93312 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -135,12 +135,25 @@ test_main (void)
do_test (i, i, 256, 0);
#endif
}
+ for (i = 1; i < 8; ++i)
+ {
+ do_test (i, i << 5, 192, 23);
+ do_test (i, i << 5, 192, 0);
+ do_test (i, i << 5, 256, 23);
+ do_test (i, i << 5, 256, 0);
+ do_test (i, i << 5, 512, 23);
+ do_test (i, i << 5, 512, 0);
+ }
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
do_test (i, i, i + 1, 23);
do_test (i, i, i + 1, 0);
+ do_test (0, i, i - 1, 23);
+ do_test (0, i, i - 1, 0);
+ do_test (i, i, i - 1, 23);
+ do_test (i, i, i - 1, 0);
#ifdef USE_AS_MEMRCHR
/* Also test the position close to the beginning for memrchr. */
do_test (0, 1, i + 1, 23);
--
2.29.2
* [PATCH v1 2/3] x86: Optimize memchr-avx2.S
2021-05-03 8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
@ 2021-05-03 8:44 ` Noah Goldstein
2021-05-03 18:50 ` H.J. Lu
` (2 more replies)
2021-05-03 8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
2 siblings, 3 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 8:44 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict, and
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
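For reference, the relaxed page cross check in this patch is
equivalent to the following C sketch (a sketch only, using the
PAGE_SIZE and VEC_SIZE values from the file): instead of
conservatively branching whenever the pointer is not suitably aligned
within 2 * VEC_SIZE, the new code branches only when a VEC_SIZE load
would actually touch two pages.

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

/* A VEC_SIZE-byte load at ptr stays within one page unless the page
   offset of ptr falls in the last VEC_SIZE - 1 bytes of the page.  */
static inline int
may_cross_page (const void *ptr)
{
  return ((uintptr_t) ptr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}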
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
1 file changed, 262 insertions(+), 184 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..8368fcd1e1 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,303 +53,367 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one
+ vector load. */
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle specially. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original
+ value is necessary for computing the return address if byte
+ is found or for adjusting the length if it is not and this
+ is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
+ rcx for memchr and rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length
+ checked for a match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ .p2align 4
+L(aligned_more):
+ /* Check the first 4 * VEC_SIZE. Only one
+ VEC_SIZE at a time since data is only aligned to
+ VEC_SIZE. */
+
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the
+ end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near the end, handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop
+ and readjust length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining
+ vectors of length case. */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with
+ remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if a match
+ was found in the loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches
+ (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within
+ remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= VEC_SIZE * 3 (note this is after
+ remaining length was found to be > VEC_SIZE * 2). */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
-END (MEMCHR)
+END(MEMCHR)
#endif
--
2.29.2
* [PATCH v1 3/3] x86: Optimize memchr-evex.S
2021-05-03 8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
@ 2021-05-03 8:44 ` Noah Goldstein
2021-05-03 18:58 ` H.J. Lu
2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
2 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 8:44 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
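The ILP gain in the 4x loop comes from replacing four VPCMPs (which
all compete for port 5) with two vpxorq (which are not limited to
port 5), one zero-masked VPMINU, and two VPCMPs. Roughly, in
intrinsics, the per-iteration match test looks like the sketch below
(assuming AVX512VL + AVX512BW; the helper name is illustrative):

#include <immintrin.h>

/* Rough intrinsics sketch of one iteration's match test in the 4x
   loop.  Bytes equal to the search char become zero after the xor,
   and the zero-mask from v1's not-equal compare forces any v1 match
   lane to zero as well, so one compare against zero covers v1/v2/v3
   while k4 covers v4.  */
static inline int
any_match_4x (__m256i v1, __m256i v2, __m256i v3, __m256i v4,
	      __m256i match)
{
  __mmask32 k1 = _mm256_cmpneq_epi8_mask (v1, match);	/* VPCMP $4 */
  __m256i x2 = _mm256_xor_si256 (v2, match);		/* 0 on match */
  __m256i x3 = _mm256_xor_si256 (v3, match);
  __mmask32 k4 = _mm256_cmpeq_epi8_mask (v4, match);	/* VPCMP $0 */
  __m256i red = _mm256_maskz_min_epu8 (k1, x2, x3);	/* VPMINU {z} */
  __mmask32 k2 = _mm256_cmpeq_epi8_mask (red, _mm256_setzero_si256 ());
  return (k2 | k4) != 0;				/* kortestd */
}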
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Tests were run on the following CPUs:
Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
All times are the geometric mean of N=20. The unit of time is
seconds.
"Cur" refers to the current implementation.
"New" refers to this patch's implementation.
Note: The numbers for size = [1, 32] are highly dependent on function
alignment. That said, the new implementation, which uses cmovcc
instead of a branch for the [1, 32] case (mostly because of the high
variance across alignments), is far more consistent and performs
about as well (and should only be a bigger improvement in cases where
the sizes / positions are not 100% predictable).
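Concretely, the cmovcc tail for length <= VEC_SIZE reduces to
something like this C sketch (select_match is a hypothetical helper;
_tzcnt_u32 needs BMI). Since tzcnt of an empty mask is the operand
width (32), the no-match case and the match-past-length case collapse
into the same conditional move, so no branch is needed:

#include <immintrin.h>
#include <stddef.h>

/* mask holds one bit per byte compared, len is the remaining
   length (<= 32 here).  _tzcnt_u32 (0) is 32, which is never < len,
   so no separate no-match test is required.  */
static inline char *
select_match (char *base, unsigned int mask, unsigned int len)
{
  unsigned int idx = _tzcnt_u32 (mask);
  char *hit = base + idx;
  /* Compiles to a cmov rather than a branch.  */
  return idx < len ? hit : NULL;
}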
For memchr-evex the numbers are a near universal improvement. The
only case where the current implementation is better is size = 0; for
size = [1, 32] with pos < size the two implementations are about the
same. For size = [1, 32] with pos > size, for medium sizes, and for
large sizes, the new implementation is faster.
Results For Tigerlake memchr-evex
size , algn , Pos , Cur T , New T , Win , Dif
2048 , 0 , 32 , 5.58 , 5.22 , New , 0.36
256 , 1 , 64 , 5.22 , 4.93 , New , 0.29
2048 , 0 , 64 , 5.22 , 4.89 , New , 0.33
256 , 2 , 64 , 5.14 , 4.81 , New , 0.33
2048 , 0 , 128 , 6.3 , 5.67 , New , 0.63
256 , 3 , 64 , 5.22 , 4.9 , New , 0.32
2048 , 0 , 256 , 11.07 , 10.92 , New , 0.15
256 , 4 , 64 , 5.16 , 4.86 , New , 0.3
2048 , 0 , 512 , 15.66 , 14.81 , New , 0.85
256 , 5 , 64 , 5.15 , 4.84 , New , 0.31
2048 , 0 , 1024 , 25.7 , 23.02 , New , 2.68
256 , 6 , 64 , 5.12 , 4.89 , New , 0.23
2048 , 0 , 2048 , 42.34 , 37.71 , New , 4.63
256 , 7 , 64 , 5.03 , 4.62 , New , 0.41
192 , 1 , 32 , 4.96 , 4.28 , New , 0.68
256 , 1 , 32 , 4.95 , 4.28 , New , 0.67
512 , 1 , 32 , 4.94 , 4.29 , New , 0.65
192 , 2 , 64 , 5.1 , 4.8 , New , 0.3
512 , 2 , 64 , 5.12 , 4.72 , New , 0.4
192 , 3 , 96 , 5.54 , 5.12 , New , 0.42
256 , 3 , 96 , 5.52 , 5.15 , New , 0.37
512 , 3 , 96 , 5.51 , 5.16 , New , 0.35
192 , 4 , 128 , 6.1 , 5.53 , New , 0.57
256 , 4 , 128 , 6.09 , 5.49 , New , 0.6
512 , 4 , 128 , 6.08 , 5.48 , New , 0.6
192 , 5 , 160 , 7.42 , 6.71 , New , 0.71
256 , 5 , 160 , 6.86 , 6.71 , New , 0.15
512 , 5 , 160 , 9.28 , 8.68 , New , 0.6
192 , 6 , 192 , 7.94 , 7.47 , New , 0.47
256 , 6 , 192 , 7.62 , 7.17 , New , 0.45
512 , 6 , 192 , 9.2 , 9.16 , New , 0.04
192 , 7 , 224 , 8.02 , 7.43 , New , 0.59
256 , 7 , 224 , 8.34 , 7.85 , New , 0.49
512 , 7 , 224 , 9.89 , 9.16 , New , 0.73
2 , 0 , 1 , 3.0 , 3.0 , Eq , 0.0
2 , 1 , 1 , 3.0 , 3.0 , Eq , 0.0
0 , 0 , 1 , 3.01 , 3.6 , Cur , 0.59
0 , 1 , 1 , 3.01 , 3.6 , Cur , 0.59
3 , 0 , 2 , 3.0 , 3.0 , Eq , 0.0
3 , 2 , 2 , 3.0 , 3.0 , Eq , 0.0
1 , 0 , 2 , 3.6 , 3.0 , New , 0.6
1 , 2 , 2 , 3.6 , 3.0 , New , 0.6
4 , 0 , 3 , 3.01 , 3.01 , Eq , 0.0
4 , 3 , 3 , 3.01 , 3.01 , Eq , 0.0
2 , 0 , 3 , 3.62 , 3.02 , New , 0.6
2 , 3 , 3 , 3.62 , 3.03 , New , 0.59
5 , 0 , 4 , 3.02 , 3.03 , Cur , 0.01
5 , 4 , 4 , 3.02 , 3.02 , Eq , 0.0
3 , 0 , 4 , 3.63 , 3.02 , New , 0.61
3 , 4 , 4 , 3.63 , 3.04 , New , 0.59
6 , 0 , 5 , 3.05 , 3.04 , New , 0.01
6 , 5 , 5 , 3.02 , 3.02 , Eq , 0.0
4 , 0 , 5 , 3.63 , 3.02 , New , 0.61
4 , 5 , 5 , 3.64 , 3.03 , New , 0.61
7 , 0 , 6 , 3.03 , 3.03 , Eq , 0.0
7 , 6 , 6 , 3.02 , 3.02 , Eq , 0.0
5 , 0 , 6 , 3.64 , 3.01 , New , 0.63
5 , 6 , 6 , 3.64 , 3.03 , New , 0.61
8 , 0 , 7 , 3.03 , 3.04 , Cur , 0.01
8 , 7 , 7 , 3.04 , 3.04 , Eq , 0.0
6 , 0 , 7 , 3.67 , 3.04 , New , 0.63
6 , 7 , 7 , 3.65 , 3.05 , New , 0.6
9 , 0 , 8 , 3.05 , 3.05 , Eq , 0.0
7 , 0 , 8 , 3.67 , 3.05 , New , 0.62
10 , 0 , 9 , 3.06 , 3.06 , Eq , 0.0
10 , 1 , 9 , 3.06 , 3.06 , Eq , 0.0
8 , 0 , 9 , 3.67 , 3.06 , New , 0.61
8 , 1 , 9 , 3.67 , 3.06 , New , 0.61
11 , 0 , 10 , 3.06 , 3.06 , Eq , 0.0
11 , 2 , 10 , 3.07 , 3.06 , New , 0.01
9 , 0 , 10 , 3.67 , 3.05 , New , 0.62
9 , 2 , 10 , 3.67 , 3.06 , New , 0.61
12 , 0 , 11 , 3.06 , 3.06 , Eq , 0.0
12 , 3 , 11 , 3.06 , 3.06 , Eq , 0.0
10 , 0 , 11 , 3.67 , 3.06 , New , 0.61
10 , 3 , 11 , 3.67 , 3.06 , New , 0.61
13 , 0 , 12 , 3.06 , 3.07 , Cur , 0.01
13 , 4 , 12 , 3.06 , 3.07 , Cur , 0.01
11 , 0 , 12 , 3.67 , 3.11 , New , 0.56
11 , 4 , 12 , 3.68 , 3.12 , New , 0.56
14 , 0 , 13 , 3.07 , 3.1 , Cur , 0.03
14 , 5 , 13 , 3.06 , 3.07 , Cur , 0.01
12 , 0 , 13 , 3.67 , 3.07 , New , 0.6
12 , 5 , 13 , 3.67 , 3.08 , New , 0.59
15 , 0 , 14 , 3.06 , 3.06 , Eq , 0.0
15 , 6 , 14 , 3.07 , 3.06 , New , 0.01
13 , 0 , 14 , 3.67 , 3.06 , New , 0.61
13 , 6 , 14 , 3.68 , 3.06 , New , 0.62
16 , 0 , 15 , 3.06 , 3.06 , Eq , 0.0
16 , 7 , 15 , 3.06 , 3.05 , New , 0.01
14 , 0 , 15 , 3.68 , 3.06 , New , 0.62
14 , 7 , 15 , 3.67 , 3.06 , New , 0.61
17 , 0 , 16 , 3.07 , 3.06 , New , 0.01
15 , 0 , 16 , 3.68 , 3.06 , New , 0.62
18 , 0 , 17 , 3.06 , 3.06 , Eq , 0.0
18 , 1 , 17 , 3.06 , 3.06 , Eq , 0.0
16 , 0 , 17 , 3.67 , 3.06 , New , 0.61
16 , 1 , 17 , 3.67 , 3.05 , New , 0.62
19 , 0 , 18 , 3.07 , 3.06 , New , 0.01
19 , 2 , 18 , 3.06 , 3.06 , Eq , 0.0
17 , 0 , 18 , 3.68 , 3.08 , New , 0.6
17 , 2 , 18 , 3.68 , 3.06 , New , 0.62
20 , 0 , 19 , 3.06 , 3.06 , Eq , 0.0
20 , 3 , 19 , 3.06 , 3.06 , Eq , 0.0
18 , 0 , 19 , 3.68 , 3.06 , New , 0.62
18 , 3 , 19 , 3.68 , 3.06 , New , 0.62
21 , 0 , 20 , 3.06 , 3.06 , Eq , 0.0
21 , 4 , 20 , 3.06 , 3.06 , Eq , 0.0
19 , 0 , 20 , 3.67 , 3.06 , New , 0.61
19 , 4 , 20 , 3.67 , 3.06 , New , 0.61
22 , 0 , 21 , 3.06 , 3.06 , Eq , 0.0
22 , 5 , 21 , 3.06 , 3.06 , Eq , 0.0
20 , 0 , 21 , 3.67 , 3.05 , New , 0.62
20 , 5 , 21 , 3.68 , 3.06 , New , 0.62
23 , 0 , 22 , 3.07 , 3.06 , New , 0.01
23 , 6 , 22 , 3.06 , 3.06 , Eq , 0.0
21 , 0 , 22 , 3.68 , 3.07 , New , 0.61
21 , 6 , 22 , 3.67 , 3.06 , New , 0.61
24 , 0 , 23 , 3.19 , 3.06 , New , 0.13
24 , 7 , 23 , 3.08 , 3.06 , New , 0.02
22 , 0 , 23 , 3.69 , 3.06 , New , 0.63
22 , 7 , 23 , 3.68 , 3.06 , New , 0.62
25 , 0 , 24 , 3.07 , 3.06 , New , 0.01
23 , 0 , 24 , 3.68 , 3.06 , New , 0.62
26 , 0 , 25 , 3.06 , 3.05 , New , 0.01
26 , 1 , 25 , 3.07 , 3.06 , New , 0.01
24 , 0 , 25 , 3.67 , 3.05 , New , 0.62
24 , 1 , 25 , 3.68 , 3.06 , New , 0.62
27 , 0 , 26 , 3.12 , 3.06 , New , 0.06
27 , 2 , 26 , 3.08 , 3.06 , New , 0.02
25 , 0 , 26 , 3.69 , 3.06 , New , 0.63
25 , 2 , 26 , 3.67 , 3.06 , New , 0.61
28 , 0 , 27 , 3.06 , 3.06 , Eq , 0.0
28 , 3 , 27 , 3.06 , 3.06 , Eq , 0.0
26 , 0 , 27 , 3.67 , 3.06 , New , 0.61
26 , 3 , 27 , 3.67 , 3.06 , New , 0.61
29 , 0 , 28 , 3.06 , 3.06 , Eq , 0.0
29 , 4 , 28 , 3.06 , 3.06 , Eq , 0.0
27 , 0 , 28 , 3.68 , 3.05 , New , 0.63
27 , 4 , 28 , 3.67 , 3.06 , New , 0.61
30 , 0 , 29 , 3.06 , 3.06 , Eq , 0.0
30 , 5 , 29 , 3.06 , 3.06 , Eq , 0.0
28 , 0 , 29 , 3.67 , 3.06 , New , 0.61
28 , 5 , 29 , 3.68 , 3.06 , New , 0.62
31 , 0 , 30 , 3.06 , 3.06 , Eq , 0.0
31 , 6 , 30 , 3.06 , 3.06 , Eq , 0.0
29 , 0 , 30 , 3.68 , 3.06 , New , 0.62
29 , 6 , 30 , 3.7 , 3.06 , New , 0.64
32 , 0 , 31 , 3.17 , 3.06 , New , 0.11
32 , 7 , 31 , 3.12 , 3.06 , New , 0.06
30 , 0 , 31 , 3.68 , 3.06 , New , 0.62
30 , 7 , 31 , 3.68 , 3.06 , New , 0.62
Results For Icelake memchr-evex
size , algn , Pos , Cur T , New T , Win , Dif
2048 , 0 , 32 , 4.94 , 4.26 , New , 0.68
256 , 1 , 64 , 4.5 , 4.13 , New , 0.37
2048 , 0 , 64 , 4.19 , 3.9 , New , 0.29
256 , 2 , 64 , 4.19 , 3.87 , New , 0.32
2048 , 0 , 128 , 4.96 , 4.53 , New , 0.43
256 , 3 , 64 , 4.07 , 3.86 , New , 0.21
2048 , 0 , 256 , 8.77 , 8.61 , New , 0.16
256 , 4 , 64 , 4.08 , 3.87 , New , 0.21
2048 , 0 , 512 , 12.22 , 11.67 , New , 0.55
256 , 5 , 64 , 4.12 , 3.83 , New , 0.29
2048 , 0 , 1024 , 20.06 , 18.09 , New , 1.97
256 , 6 , 64 , 4.2 , 3.95 , New , 0.25
2048 , 0 , 2048 , 33.83 , 30.62 , New , 3.21
256 , 7 , 64 , 4.3 , 4.04 , New , 0.26
192 , 1 , 32 , 4.2 , 3.71 , New , 0.49
256 , 1 , 32 , 4.24 , 3.76 , New , 0.48
512 , 1 , 32 , 4.29 , 3.74 , New , 0.55
192 , 2 , 64 , 4.42 , 4.0 , New , 0.42
512 , 2 , 64 , 4.17 , 3.83 , New , 0.34
192 , 3 , 96 , 4.44 , 4.26 , New , 0.18
256 , 3 , 96 , 4.45 , 4.14 , New , 0.31
512 , 3 , 96 , 4.42 , 4.15 , New , 0.27
192 , 4 , 128 , 4.93 , 4.45 , New , 0.48
256 , 4 , 128 , 4.93 , 4.47 , New , 0.46
512 , 4 , 128 , 4.95 , 4.47 , New , 0.48
192 , 5 , 160 , 5.95 , 5.44 , New , 0.51
256 , 5 , 160 , 5.59 , 5.47 , New , 0.12
512 , 5 , 160 , 7.59 , 7.34 , New , 0.25
192 , 6 , 192 , 6.53 , 6.08 , New , 0.45
256 , 6 , 192 , 6.2 , 5.88 , New , 0.32
512 , 6 , 192 , 7.53 , 7.62 , Cur , 0.09
192 , 7 , 224 , 6.62 , 6.12 , New , 0.5
256 , 7 , 224 , 6.79 , 6.51 , New , 0.28
512 , 7 , 224 , 8.12 , 7.61 , New , 0.51
2 , 0 , 1 , 2.5 , 2.54 , Cur , 0.04
2 , 1 , 1 , 2.56 , 2.55 , New , 0.01
0 , 0 , 1 , 2.57 , 3.12 , Cur , 0.55
0 , 1 , 1 , 2.59 , 3.14 , Cur , 0.55
3 , 0 , 2 , 2.62 , 2.63 , Cur , 0.01
3 , 2 , 2 , 2.66 , 2.67 , Cur , 0.01
1 , 0 , 2 , 3.24 , 2.72 , New , 0.52
1 , 2 , 2 , 3.28 , 2.75 , New , 0.53
4 , 0 , 3 , 2.78 , 2.8 , Cur , 0.02
4 , 3 , 3 , 2.8 , 2.82 , Cur , 0.02
2 , 0 , 3 , 3.38 , 2.86 , New , 0.52
2 , 3 , 3 , 3.41 , 2.89 , New , 0.52
5 , 0 , 4 , 2.88 , 2.91 , Cur , 0.03
5 , 4 , 4 , 2.88 , 2.92 , Cur , 0.04
3 , 0 , 4 , 3.48 , 2.93 , New , 0.55
3 , 4 , 4 , 3.47 , 2.93 , New , 0.54
6 , 0 , 5 , 2.95 , 2.94 , New , 0.01
6 , 5 , 5 , 2.91 , 2.92 , Cur , 0.01
4 , 0 , 5 , 3.47 , 2.9 , New , 0.57
4 , 5 , 5 , 3.43 , 2.91 , New , 0.52
7 , 0 , 6 , 2.87 , 2.9 , Cur , 0.03
7 , 6 , 6 , 2.87 , 2.89 , Cur , 0.02
5 , 0 , 6 , 3.44 , 2.88 , New , 0.56
5 , 6 , 6 , 3.41 , 2.87 , New , 0.54
8 , 0 , 7 , 2.86 , 2.87 , Cur , 0.01
8 , 7 , 7 , 2.86 , 2.87 , Cur , 0.01
6 , 0 , 7 , 3.43 , 2.87 , New , 0.56
6 , 7 , 7 , 3.44 , 2.87 , New , 0.57
9 , 0 , 8 , 2.86 , 2.88 , Cur , 0.02
7 , 0 , 8 , 3.41 , 2.89 , New , 0.52
10 , 0 , 9 , 2.83 , 2.87 , Cur , 0.04
10 , 1 , 9 , 2.82 , 2.87 , Cur , 0.05
8 , 0 , 9 , 3.4 , 2.89 , New , 0.51
8 , 1 , 9 , 3.41 , 2.87 , New , 0.54
11 , 0 , 10 , 2.83 , 2.88 , Cur , 0.05
11 , 2 , 10 , 2.84 , 2.88 , Cur , 0.04
9 , 0 , 10 , 3.41 , 2.87 , New , 0.54
9 , 2 , 10 , 3.41 , 2.88 , New , 0.53
12 , 0 , 11 , 2.83 , 2.89 , Cur , 0.06
12 , 3 , 11 , 2.85 , 2.87 , Cur , 0.02
10 , 0 , 11 , 3.41 , 2.87 , New , 0.54
10 , 3 , 11 , 3.42 , 2.88 , New , 0.54
13 , 0 , 12 , 2.86 , 2.87 , Cur , 0.01
13 , 4 , 12 , 2.84 , 2.88 , Cur , 0.04
11 , 0 , 12 , 3.43 , 2.87 , New , 0.56
11 , 4 , 12 , 3.49 , 2.87 , New , 0.62
14 , 0 , 13 , 2.85 , 2.86 , Cur , 0.01
14 , 5 , 13 , 2.85 , 2.86 , Cur , 0.01
12 , 0 , 13 , 3.41 , 2.86 , New , 0.55
12 , 5 , 13 , 3.44 , 2.85 , New , 0.59
15 , 0 , 14 , 2.83 , 2.87 , Cur , 0.04
15 , 6 , 14 , 2.82 , 2.86 , Cur , 0.04
13 , 0 , 14 , 3.41 , 2.86 , New , 0.55
13 , 6 , 14 , 3.4 , 2.86 , New , 0.54
16 , 0 , 15 , 2.84 , 2.86 , Cur , 0.02
16 , 7 , 15 , 2.83 , 2.85 , Cur , 0.02
14 , 0 , 15 , 3.41 , 2.85 , New , 0.56
14 , 7 , 15 , 3.39 , 2.87 , New , 0.52
17 , 0 , 16 , 2.83 , 2.87 , Cur , 0.04
15 , 0 , 16 , 3.4 , 2.85 , New , 0.55
18 , 0 , 17 , 2.83 , 2.86 , Cur , 0.03
18 , 1 , 17 , 2.85 , 2.84 , New , 0.01
16 , 0 , 17 , 3.41 , 2.85 , New , 0.56
16 , 1 , 17 , 3.4 , 2.86 , New , 0.54
19 , 0 , 18 , 2.8 , 2.84 , Cur , 0.04
19 , 2 , 18 , 2.82 , 2.83 , Cur , 0.01
17 , 0 , 18 , 3.39 , 2.86 , New , 0.53
17 , 2 , 18 , 3.39 , 2.84 , New , 0.55
20 , 0 , 19 , 2.85 , 2.87 , Cur , 0.02
20 , 3 , 19 , 2.88 , 2.87 , New , 0.01
18 , 0 , 19 , 3.38 , 2.85 , New , 0.53
18 , 3 , 19 , 3.4 , 2.85 , New , 0.55
21 , 0 , 20 , 2.83 , 2.85 , Cur , 0.02
21 , 4 , 20 , 2.88 , 2.85 , New , 0.03
19 , 0 , 20 , 3.39 , 2.84 , New , 0.55
19 , 4 , 20 , 3.39 , 2.96 , New , 0.43
22 , 0 , 21 , 2.84 , 2.9 , Cur , 0.06
22 , 5 , 21 , 2.81 , 2.84 , Cur , 0.03
20 , 0 , 21 , 3.41 , 2.81 , New , 0.6
20 , 5 , 21 , 3.38 , 2.83 , New , 0.55
23 , 0 , 22 , 2.8 , 2.82 , Cur , 0.02
23 , 6 , 22 , 2.81 , 2.83 , Cur , 0.02
21 , 0 , 22 , 3.35 , 2.81 , New , 0.54
21 , 6 , 22 , 3.34 , 2.81 , New , 0.53
24 , 0 , 23 , 2.77 , 2.84 , Cur , 0.07
24 , 7 , 23 , 2.78 , 2.8 , Cur , 0.02
22 , 0 , 23 , 3.34 , 2.79 , New , 0.55
22 , 7 , 23 , 3.32 , 2.79 , New , 0.53
25 , 0 , 24 , 2.77 , 2.8 , Cur , 0.03
23 , 0 , 24 , 3.29 , 2.79 , New , 0.5
26 , 0 , 25 , 2.73 , 2.78 , Cur , 0.05
26 , 1 , 25 , 2.75 , 2.79 , Cur , 0.04
24 , 0 , 25 , 3.27 , 2.79 , New , 0.48
24 , 1 , 25 , 3.27 , 2.77 , New , 0.5
27 , 0 , 26 , 2.72 , 2.78 , Cur , 0.06
27 , 2 , 26 , 2.75 , 2.76 , Cur , 0.01
25 , 0 , 26 , 3.29 , 2.73 , New , 0.56
25 , 2 , 26 , 3.3 , 2.76 , New , 0.54
28 , 0 , 27 , 2.75 , 2.79 , Cur , 0.04
28 , 3 , 27 , 2.77 , 2.77 , Eq , 0.0
26 , 0 , 27 , 3.28 , 2.78 , New , 0.5
26 , 3 , 27 , 3.29 , 2.78 , New , 0.51
29 , 0 , 28 , 2.74 , 2.76 , Cur , 0.02
29 , 4 , 28 , 2.74 , 2.77 , Cur , 0.03
27 , 0 , 28 , 3.3 , 2.76 , New , 0.54
27 , 4 , 28 , 3.3 , 2.74 , New , 0.56
30 , 0 , 29 , 2.72 , 2.76 , Cur , 0.04
30 , 5 , 29 , 2.74 , 2.75 , Cur , 0.01
28 , 0 , 29 , 3.25 , 2.73 , New , 0.52
28 , 5 , 29 , 3.3 , 2.73 , New , 0.57
31 , 0 , 30 , 2.73 , 2.77 , Cur , 0.04
31 , 6 , 30 , 2.74 , 2.76 , Cur , 0.02
29 , 0 , 30 , 3.25 , 2.73 , New , 0.52
29 , 6 , 30 , 3.26 , 2.74 , New , 0.52
32 , 0 , 31 , 2.73 , 2.74 , Cur , 0.01
32 , 7 , 31 , 2.73 , 2.75 , Cur , 0.02
30 , 0 , 31 , 3.24 , 2.72 , New , 0.52
30 , 7 , 31 , 3.24 , 2.72 , New , 0.52
For memchr-avx2 the improvements are more modest though again near
universal. The improvement is most significant for medium sizes and
for small sizes with pos > size. For small sizes with pos < size and
for large sizes the two implementations perform roughly the same.
Results For Tigerlake memchr-avx2
size , algn , Pos , Cur T , New T , Win , Dif
2048 , 0 , 32 , 6.15 , 6.27 , Cur , 0.12
256 , 1 , 64 , 6.21 , 6.03 , New , 0.18
2048 , 0 , 64 , 6.07 , 5.95 , New , 0.12
256 , 2 , 64 , 6.01 , 5.8 , New , 0.21
2048 , 0 , 128 , 7.05 , 6.55 , New , 0.5
256 , 3 , 64 , 6.14 , 5.83 , New , 0.31
2048 , 0 , 256 , 11.78 , 11.78 , Eq , 0.0
256 , 4 , 64 , 6.1 , 5.85 , New , 0.25
2048 , 0 , 512 , 16.32 , 15.96 , New , 0.36
256 , 5 , 64 , 6.1 , 5.77 , New , 0.33
2048 , 0 , 1024 , 25.38 , 25.18 , New , 0.2
256 , 6 , 64 , 6.08 , 5.88 , New , 0.2
2048 , 0 , 2048 , 38.56 , 38.32 , New , 0.24
256 , 7 , 64 , 5.93 , 5.68 , New , 0.25
192 , 1 , 32 , 5.49 , 5.3 , New , 0.19
256 , 1 , 32 , 5.5 , 5.28 , New , 0.22
512 , 1 , 32 , 5.48 , 5.32 , New , 0.16
192 , 2 , 64 , 6.1 , 5.73 , New , 0.37
512 , 2 , 64 , 5.88 , 5.72 , New , 0.16
192 , 3 , 96 , 6.31 , 5.93 , New , 0.38
256 , 3 , 96 , 6.32 , 5.93 , New , 0.39
512 , 3 , 96 , 6.2 , 5.94 , New , 0.26
192 , 4 , 128 , 6.65 , 6.4 , New , 0.25
256 , 4 , 128 , 6.6 , 6.37 , New , 0.23
512 , 4 , 128 , 6.74 , 6.33 , New , 0.41
192 , 5 , 160 , 7.78 , 7.4 , New , 0.38
256 , 5 , 160 , 7.18 , 7.4 , Cur , 0.22
512 , 5 , 160 , 9.81 , 9.44 , New , 0.37
192 , 6 , 192 , 9.12 , 7.77 , New , 1.35
256 , 6 , 192 , 7.97 , 7.66 , New , 0.31
512 , 6 , 192 , 10.14 , 9.95 , New , 0.19
192 , 7 , 224 , 8.96 , 7.78 , New , 1.18
256 , 7 , 224 , 8.52 , 8.23 , New , 0.29
512 , 7 , 224 , 10.33 , 9.98 , New , 0.35
2 , 0 , 1 , 3.61 , 3.6 , New , 0.01
2 , 1 , 1 , 3.6 , 3.6 , Eq , 0.0
0 , 0 , 1 , 3.02 , 3.0 , New , 0.02
0 , 1 , 1 , 3.0 , 3.0 , Eq , 0.0
3 , 0 , 2 , 3.6 , 3.6 , Eq , 0.0
3 , 2 , 2 , 3.61 , 3.6 , New , 0.01
1 , 0 , 2 , 4.82 , 3.6 , New , 1.22
1 , 2 , 2 , 4.81 , 3.6 , New , 1.21
4 , 0 , 3 , 3.61 , 3.61 , Eq , 0.0
4 , 3 , 3 , 3.62 , 3.61 , New , 0.01
2 , 0 , 3 , 4.82 , 3.62 , New , 1.2
2 , 3 , 3 , 4.83 , 3.63 , New , 1.2
5 , 0 , 4 , 3.63 , 3.64 , Cur , 0.01
5 , 4 , 4 , 3.63 , 3.62 , New , 0.01
3 , 0 , 4 , 4.84 , 3.62 , New , 1.22
3 , 4 , 4 , 4.84 , 3.64 , New , 1.2
6 , 0 , 5 , 3.66 , 3.64 , New , 0.02
6 , 5 , 5 , 3.65 , 3.62 , New , 0.03
4 , 0 , 5 , 4.83 , 3.63 , New , 1.2
4 , 5 , 5 , 4.85 , 3.64 , New , 1.21
7 , 0 , 6 , 3.76 , 3.79 , Cur , 0.03
7 , 6 , 6 , 3.76 , 3.72 , New , 0.04
5 , 0 , 6 , 4.84 , 3.62 , New , 1.22
5 , 6 , 6 , 4.85 , 3.64 , New , 1.21
8 , 0 , 7 , 3.64 , 3.65 , Cur , 0.01
8 , 7 , 7 , 3.65 , 3.65 , Eq , 0.0
6 , 0 , 7 , 4.88 , 3.64 , New , 1.24
6 , 7 , 7 , 4.87 , 3.65 , New , 1.22
9 , 0 , 8 , 3.66 , 3.66 , Eq , 0.0
7 , 0 , 8 , 4.89 , 3.66 , New , 1.23
10 , 0 , 9 , 3.67 , 3.67 , Eq , 0.0
10 , 1 , 9 , 3.67 , 3.67 , Eq , 0.0
8 , 0 , 9 , 4.9 , 3.67 , New , 1.23
8 , 1 , 9 , 4.9 , 3.67 , New , 1.23
11 , 0 , 10 , 3.68 , 3.67 , New , 0.01
11 , 2 , 10 , 3.69 , 3.67 , New , 0.02
9 , 0 , 10 , 4.9 , 3.67 , New , 1.23
9 , 2 , 10 , 4.9 , 3.67 , New , 1.23
12 , 0 , 11 , 3.71 , 3.68 , New , 0.03
12 , 3 , 11 , 3.71 , 3.67 , New , 0.04
10 , 0 , 11 , 4.9 , 3.67 , New , 1.23
10 , 3 , 11 , 4.9 , 3.67 , New , 1.23
13 , 0 , 12 , 4.24 , 4.23 , New , 0.01
13 , 4 , 12 , 4.23 , 4.23 , Eq , 0.0
11 , 0 , 12 , 4.9 , 3.7 , New , 1.2
11 , 4 , 12 , 4.9 , 3.73 , New , 1.17
14 , 0 , 13 , 3.99 , 4.01 , Cur , 0.02
14 , 5 , 13 , 3.98 , 3.98 , Eq , 0.0
12 , 0 , 13 , 4.9 , 3.69 , New , 1.21
12 , 5 , 13 , 4.9 , 3.69 , New , 1.21
15 , 0 , 14 , 3.99 , 3.97 , New , 0.02
15 , 6 , 14 , 4.0 , 4.0 , Eq , 0.0
13 , 0 , 14 , 4.9 , 3.67 , New , 1.23
13 , 6 , 14 , 4.9 , 3.67 , New , 1.23
16 , 0 , 15 , 3.99 , 4.02 , Cur , 0.03
16 , 7 , 15 , 4.01 , 3.96 , New , 0.05
14 , 0 , 15 , 4.93 , 3.67 , New , 1.26
14 , 7 , 15 , 4.92 , 3.67 , New , 1.25
17 , 0 , 16 , 4.04 , 3.99 , New , 0.05
15 , 0 , 16 , 5.42 , 4.22 , New , 1.2
18 , 0 , 17 , 4.01 , 3.97 , New , 0.04
18 , 1 , 17 , 3.99 , 3.98 , New , 0.01
16 , 0 , 17 , 5.22 , 3.98 , New , 1.24
16 , 1 , 17 , 5.19 , 3.98 , New , 1.21
19 , 0 , 18 , 4.0 , 3.99 , New , 0.01
19 , 2 , 18 , 4.03 , 3.97 , New , 0.06
17 , 0 , 18 , 5.18 , 3.99 , New , 1.19
17 , 2 , 18 , 5.18 , 3.98 , New , 1.2
20 , 0 , 19 , 4.02 , 3.98 , New , 0.04
20 , 3 , 19 , 4.0 , 3.98 , New , 0.02
18 , 0 , 19 , 5.19 , 3.97 , New , 1.22
18 , 3 , 19 , 5.21 , 3.98 , New , 1.23
21 , 0 , 20 , 3.98 , 4.0 , Cur , 0.02
21 , 4 , 20 , 4.0 , 4.0 , Eq , 0.0
19 , 0 , 20 , 5.19 , 3.99 , New , 1.2
19 , 4 , 20 , 5.17 , 3.99 , New , 1.18
22 , 0 , 21 , 4.03 , 3.98 , New , 0.05
22 , 5 , 21 , 4.01 , 3.95 , New , 0.06
20 , 0 , 21 , 5.19 , 4.0 , New , 1.19
20 , 5 , 21 , 5.21 , 3.99 , New , 1.22
23 , 0 , 22 , 4.06 , 3.97 , New , 0.09
23 , 6 , 22 , 4.02 , 3.98 , New , 0.04
21 , 0 , 22 , 5.2 , 4.02 , New , 1.18
21 , 6 , 22 , 5.22 , 4.0 , New , 1.22
24 , 0 , 23 , 4.15 , 3.98 , New , 0.17
24 , 7 , 23 , 4.0 , 4.01 , Cur , 0.01
22 , 0 , 23 , 5.28 , 4.0 , New , 1.28
22 , 7 , 23 , 5.22 , 3.99 , New , 1.23
25 , 0 , 24 , 4.1 , 4.04 , New , 0.06
23 , 0 , 24 , 5.23 , 4.04 , New , 1.19
26 , 0 , 25 , 4.1 , 4.06 , New , 0.04
26 , 1 , 25 , 4.07 , 3.99 , New , 0.08
24 , 0 , 25 , 5.26 , 4.02 , New , 1.24
24 , 1 , 25 , 5.21 , 4.0 , New , 1.21
27 , 0 , 26 , 4.17 , 4.03 , New , 0.14
27 , 2 , 26 , 4.09 , 4.03 , New , 0.06
25 , 0 , 26 , 5.29 , 4.1 , New , 1.19
25 , 2 , 26 , 5.25 , 4.0 , New , 1.25
28 , 0 , 27 , 4.06 , 4.1 , Cur , 0.04
28 , 3 , 27 , 4.09 , 4.04 , New , 0.05
26 , 0 , 27 , 5.26 , 4.04 , New , 1.22
26 , 3 , 27 , 5.28 , 4.01 , New , 1.27
29 , 0 , 28 , 4.07 , 4.02 , New , 0.05
29 , 4 , 28 , 4.07 , 4.05 , New , 0.02
27 , 0 , 28 , 5.25 , 4.02 , New , 1.23
27 , 4 , 28 , 5.25 , 4.03 , New , 1.22
30 , 0 , 29 , 4.14 , 4.06 , New , 0.08
30 , 5 , 29 , 4.08 , 4.04 , New , 0.04
28 , 0 , 29 , 5.26 , 4.07 , New , 1.19
28 , 5 , 29 , 5.28 , 4.04 , New , 1.24
31 , 0 , 30 , 4.09 , 4.08 , New , 0.01
31 , 6 , 30 , 4.1 , 4.08 , New , 0.02
29 , 0 , 30 , 5.28 , 4.05 , New , 1.23
29 , 6 , 30 , 5.24 , 4.07 , New , 1.17
32 , 0 , 31 , 4.1 , 4.13 , Cur , 0.03
32 , 7 , 31 , 4.16 , 4.09 , New , 0.07
30 , 0 , 31 , 5.31 , 4.09 , New , 1.22
30 , 7 , 31 , 5.28 , 4.08 , New , 1.2
Results For Icelake memchr-avx2
size , algn , Pos , Cur T , New T , Win , Dif
2048 , 0 , 32 , 5.74 , 5.08 , New , 0.66
256 , 1 , 64 , 5.16 , 4.93 , New , 0.23
2048 , 0 , 64 , 4.86 , 4.69 , New , 0.17
256 , 2 , 64 , 4.78 , 4.7 , New , 0.08
2048 , 0 , 128 , 5.64 , 5.0 , New , 0.64
256 , 3 , 64 , 4.64 , 4.59 , New , 0.05
2048 , 0 , 256 , 9.07 , 9.17 , Cur , 0.1
256 , 4 , 64 , 4.7 , 4.6 , New , 0.1
2048 , 0 , 512 , 12.56 , 12.33 , New , 0.23
256 , 5 , 64 , 4.72 , 4.61 , New , 0.11
2048 , 0 , 1024 , 19.36 , 19.49 , Cur , 0.13
256 , 6 , 64 , 4.82 , 4.69 , New , 0.13
2048 , 0 , 2048 , 29.99 , 30.53 , Cur , 0.54
256 , 7 , 64 , 4.9 , 4.85 , New , 0.05
192 , 1 , 32 , 4.89 , 4.45 , New , 0.44
256 , 1 , 32 , 4.93 , 4.44 , New , 0.49
512 , 1 , 32 , 4.97 , 4.45 , New , 0.52
192 , 2 , 64 , 5.04 , 4.65 , New , 0.39
512 , 2 , 64 , 4.75 , 4.66 , New , 0.09
192 , 3 , 96 , 5.14 , 4.66 , New , 0.48
256 , 3 , 96 , 5.12 , 4.66 , New , 0.46
512 , 3 , 96 , 5.13 , 4.62 , New , 0.51
192 , 4 , 128 , 5.65 , 4.95 , New , 0.7
256 , 4 , 128 , 5.63 , 4.95 , New , 0.68
512 , 4 , 128 , 5.68 , 4.96 , New , 0.72
192 , 5 , 160 , 6.1 , 5.84 , New , 0.26
256 , 5 , 160 , 5.58 , 5.84 , Cur , 0.26
512 , 5 , 160 , 7.95 , 7.74 , New , 0.21
192 , 6 , 192 , 7.07 , 6.23 , New , 0.84
256 , 6 , 192 , 6.34 , 6.09 , New , 0.25
512 , 6 , 192 , 8.17 , 8.13 , New , 0.04
192 , 7 , 224 , 7.06 , 6.23 , New , 0.83
256 , 7 , 224 , 6.76 , 6.65 , New , 0.11
512 , 7 , 224 , 8.29 , 8.08 , New , 0.21
2 , 0 , 1 , 3.0 , 3.04 , Cur , 0.04
2 , 1 , 1 , 3.06 , 3.07 , Cur , 0.01
0 , 0 , 1 , 2.57 , 2.59 , Cur , 0.02
0 , 1 , 1 , 2.6 , 2.61 , Cur , 0.01
3 , 0 , 2 , 3.15 , 3.17 , Cur , 0.02
3 , 2 , 2 , 3.19 , 3.21 , Cur , 0.02
1 , 0 , 2 , 4.32 , 3.25 , New , 1.07
1 , 2 , 2 , 4.36 , 3.31 , New , 1.05
4 , 0 , 3 , 3.5 , 3.52 , Cur , 0.02
4 , 3 , 3 , 3.52 , 3.54 , Cur , 0.02
2 , 0 , 3 , 4.51 , 3.43 , New , 1.08
2 , 3 , 3 , 4.56 , 3.47 , New , 1.09
5 , 0 , 4 , 3.61 , 3.65 , Cur , 0.04
5 , 4 , 4 , 3.63 , 3.67 , Cur , 0.04
3 , 0 , 4 , 4.64 , 3.51 , New , 1.13
3 , 4 , 4 , 4.7 , 3.51 , New , 1.19
6 , 0 , 5 , 3.66 , 3.68 , Cur , 0.02
6 , 5 , 5 , 3.69 , 3.65 , New , 0.04
4 , 0 , 5 , 4.7 , 3.49 , New , 1.21
4 , 5 , 5 , 4.58 , 3.48 , New , 1.1
7 , 0 , 6 , 3.6 , 3.65 , Cur , 0.05
7 , 6 , 6 , 3.59 , 3.64 , Cur , 0.05
5 , 0 , 6 , 4.74 , 3.65 , New , 1.09
5 , 6 , 6 , 4.73 , 3.64 , New , 1.09
8 , 0 , 7 , 3.6 , 3.61 , Cur , 0.01
8 , 7 , 7 , 3.6 , 3.61 , Cur , 0.01
6 , 0 , 7 , 4.73 , 3.6 , New , 1.13
6 , 7 , 7 , 4.73 , 3.62 , New , 1.11
9 , 0 , 8 , 3.59 , 3.62 , Cur , 0.03
7 , 0 , 8 , 4.72 , 3.64 , New , 1.08
10 , 0 , 9 , 3.57 , 3.62 , Cur , 0.05
10 , 1 , 9 , 3.56 , 3.61 , Cur , 0.05
8 , 0 , 9 , 4.69 , 3.63 , New , 1.06
8 , 1 , 9 , 4.71 , 3.61 , New , 1.1
11 , 0 , 10 , 3.58 , 3.62 , Cur , 0.04
11 , 2 , 10 , 3.59 , 3.63 , Cur , 0.04
9 , 0 , 10 , 4.72 , 3.61 , New , 1.11
9 , 2 , 10 , 4.7 , 3.61 , New , 1.09
12 , 0 , 11 , 3.58 , 3.63 , Cur , 0.05
12 , 3 , 11 , 3.58 , 3.62 , Cur , 0.04
10 , 0 , 11 , 4.7 , 3.6 , New , 1.1
10 , 3 , 11 , 4.73 , 3.64 , New , 1.09
13 , 0 , 12 , 3.6 , 3.6 , Eq , 0.0
13 , 4 , 12 , 3.57 , 3.62 , Cur , 0.05
11 , 0 , 12 , 4.73 , 3.62 , New , 1.11
11 , 4 , 12 , 4.79 , 3.61 , New , 1.18
14 , 0 , 13 , 3.61 , 3.62 , Cur , 0.01
14 , 5 , 13 , 3.59 , 3.59 , Eq , 0.0
12 , 0 , 13 , 4.7 , 3.61 , New , 1.09
12 , 5 , 13 , 4.75 , 3.58 , New , 1.17
15 , 0 , 14 , 3.58 , 3.62 , Cur , 0.04
15 , 6 , 14 , 3.59 , 3.62 , Cur , 0.03
13 , 0 , 14 , 4.68 , 3.6 , New , 1.08
13 , 6 , 14 , 4.68 , 3.63 , New , 1.05
16 , 0 , 15 , 3.57 , 3.6 , Cur , 0.03
16 , 7 , 15 , 3.55 , 3.59 , Cur , 0.04
14 , 0 , 15 , 4.69 , 3.61 , New , 1.08
14 , 7 , 15 , 4.69 , 3.61 , New , 1.08
17 , 0 , 16 , 3.56 , 3.61 , Cur , 0.05
15 , 0 , 16 , 4.71 , 3.58 , New , 1.13
18 , 0 , 17 , 3.57 , 3.65 , Cur , 0.08
18 , 1 , 17 , 3.58 , 3.59 , Cur , 0.01
16 , 0 , 17 , 4.7 , 3.58 , New , 1.12
16 , 1 , 17 , 4.68 , 3.59 , New , 1.09
19 , 0 , 18 , 3.51 , 3.58 , Cur , 0.07
19 , 2 , 18 , 3.55 , 3.58 , Cur , 0.03
17 , 0 , 18 , 4.69 , 3.61 , New , 1.08
17 , 2 , 18 , 4.68 , 3.61 , New , 1.07
20 , 0 , 19 , 3.57 , 3.6 , Cur , 0.03
20 , 3 , 19 , 3.59 , 3.59 , Eq , 0.0
18 , 0 , 19 , 4.68 , 3.59 , New , 1.09
18 , 3 , 19 , 4.67 , 3.57 , New , 1.1
21 , 0 , 20 , 3.61 , 3.58 , New , 0.03
21 , 4 , 20 , 3.62 , 3.6 , New , 0.02
19 , 0 , 20 , 4.74 , 3.57 , New , 1.17
19 , 4 , 20 , 4.69 , 3.7 , New , 0.99
22 , 0 , 21 , 3.57 , 3.64 , Cur , 0.07
22 , 5 , 21 , 3.55 , 3.6 , Cur , 0.05
20 , 0 , 21 , 4.72 , 3.55 , New , 1.17
20 , 5 , 21 , 4.66 , 3.55 , New , 1.11
23 , 0 , 22 , 3.56 , 3.56 , Eq , 0.0
23 , 6 , 22 , 3.54 , 3.56 , Cur , 0.02
21 , 0 , 22 , 4.65 , 3.53 , New , 1.12
21 , 6 , 22 , 4.62 , 3.56 , New , 1.06
24 , 0 , 23 , 3.5 , 3.54 , Cur , 0.04
24 , 7 , 23 , 3.52 , 3.53 , Cur , 0.01
22 , 0 , 23 , 4.61 , 3.51 , New , 1.1
22 , 7 , 23 , 4.6 , 3.51 , New , 1.09
25 , 0 , 24 , 3.5 , 3.53 , Cur , 0.03
23 , 0 , 24 , 4.54 , 3.5 , New , 1.04
26 , 0 , 25 , 3.47 , 3.49 , Cur , 0.02
26 , 1 , 25 , 3.46 , 3.51 , Cur , 0.05
24 , 0 , 25 , 4.53 , 3.51 , New , 1.02
24 , 1 , 25 , 4.51 , 3.51 , New , 1.0
27 , 0 , 26 , 3.44 , 3.51 , Cur , 0.07
27 , 2 , 26 , 3.51 , 3.52 , Cur , 0.01
25 , 0 , 26 , 4.56 , 3.46 , New , 1.1
25 , 2 , 26 , 4.55 , 3.47 , New , 1.08
28 , 0 , 27 , 3.47 , 3.5 , Cur , 0.03
28 , 3 , 27 , 3.48 , 3.47 , New , 0.01
26 , 0 , 27 , 4.52 , 3.44 , New , 1.08
26 , 3 , 27 , 4.55 , 3.46 , New , 1.09
29 , 0 , 28 , 3.45 , 3.49 , Cur , 0.04
29 , 4 , 28 , 3.5 , 3.5 , Eq , 0.0
27 , 0 , 28 , 4.56 , 3.49 , New , 1.07
27 , 4 , 28 , 4.5 , 3.49 , New , 1.01
30 , 0 , 29 , 3.44 , 3.48 , Cur , 0.04
30 , 5 , 29 , 3.46 , 3.47 , Cur , 0.01
28 , 0 , 29 , 4.49 , 3.43 , New , 1.06
28 , 5 , 29 , 4.57 , 3.45 , New , 1.12
31 , 0 , 30 , 3.48 , 3.48 , Eq , 0.0
31 , 6 , 30 , 3.46 , 3.49 , Cur , 0.03
29 , 0 , 30 , 4.49 , 3.44 , New , 1.05
29 , 6 , 30 , 4.53 , 3.44 , New , 1.09
32 , 0 , 31 , 3.44 , 3.45 , Cur , 0.01
32 , 7 , 31 , 3.46 , 3.51 , Cur , 0.05
30 , 0 , 31 , 4.48 , 3.42 , New , 1.06
30 , 7 , 31 , 4.48 , 3.44 , New , 1.04
Results For Skylake memchr-avx2
size , algn , Pos , Cur T , New T , Win , Dif
2048 , 0 , 32 , 6.61 , 5.4 , New , 1.21
256 , 1 , 64 , 6.52 , 5.68 , New , 0.84
2048 , 0 , 64 , 6.03 , 5.47 , New , 0.56
256 , 2 , 64 , 6.07 , 5.42 , New , 0.65
2048 , 0 , 128 , 7.01 , 5.83 , New , 1.18
256 , 3 , 64 , 6.24 , 5.68 , New , 0.56
2048 , 0 , 256 , 11.03 , 9.86 , New , 1.17
256 , 4 , 64 , 6.17 , 5.49 , New , 0.68
2048 , 0 , 512 , 14.11 , 13.41 , New , 0.7
256 , 5 , 64 , 6.03 , 5.45 , New , 0.58
2048 , 0 , 1024 , 19.82 , 19.92 , Cur , 0.1
256 , 6 , 64 , 6.14 , 5.7 , New , 0.44
2048 , 0 , 2048 , 30.9 , 30.59 , New , 0.31
256 , 7 , 64 , 6.05 , 5.64 , New , 0.41
192 , 1 , 32 , 5.6 , 4.89 , New , 0.71
256 , 1 , 32 , 5.59 , 5.07 , New , 0.52
512 , 1 , 32 , 5.58 , 4.93 , New , 0.65
192 , 2 , 64 , 6.14 , 5.46 , New , 0.68
512 , 2 , 64 , 5.95 , 5.38 , New , 0.57
192 , 3 , 96 , 6.6 , 5.74 , New , 0.86
256 , 3 , 96 , 6.48 , 5.37 , New , 1.11
512 , 3 , 96 , 6.56 , 5.44 , New , 1.12
192 , 4 , 128 , 7.04 , 6.02 , New , 1.02
256 , 4 , 128 , 6.96 , 5.89 , New , 1.07
512 , 4 , 128 , 6.97 , 5.99 , New , 0.98
192 , 5 , 160 , 8.49 , 7.07 , New , 1.42
256 , 5 , 160 , 8.1 , 6.96 , New , 1.14
512 , 5 , 160 , 10.48 , 9.14 , New , 1.34
192 , 6 , 192 , 8.46 , 8.52 , Cur , 0.06
256 , 6 , 192 , 8.53 , 7.58 , New , 0.95
512 , 6 , 192 , 10.88 , 9.06 , New , 1.82
192 , 7 , 224 , 8.59 , 8.35 , New , 0.24
256 , 7 , 224 , 8.86 , 7.91 , New , 0.95
512 , 7 , 224 , 10.89 , 8.98 , New , 1.91
2 , 0 , 1 , 4.28 , 3.62 , New , 0.66
2 , 1 , 1 , 4.32 , 3.75 , New , 0.57
0 , 0 , 1 , 3.76 , 3.24 , New , 0.52
0 , 1 , 1 , 3.7 , 3.19 , New , 0.51
3 , 0 , 2 , 4.16 , 3.67 , New , 0.49
3 , 2 , 2 , 4.21 , 3.68 , New , 0.53
1 , 0 , 2 , 4.25 , 3.74 , New , 0.51
1 , 2 , 2 , 4.4 , 3.82 , New , 0.58
4 , 0 , 3 , 4.43 , 3.88 , New , 0.55
4 , 3 , 3 , 4.34 , 3.8 , New , 0.54
2 , 0 , 3 , 4.33 , 3.79 , New , 0.54
2 , 3 , 3 , 4.37 , 3.84 , New , 0.53
5 , 0 , 4 , 4.45 , 3.87 , New , 0.58
5 , 4 , 4 , 4.41 , 3.84 , New , 0.57
3 , 0 , 4 , 4.34 , 3.83 , New , 0.51
3 , 4 , 4 , 4.35 , 3.82 , New , 0.53
6 , 0 , 5 , 4.41 , 3.88 , New , 0.53
6 , 5 , 5 , 4.41 , 3.88 , New , 0.53
4 , 0 , 5 , 4.35 , 3.84 , New , 0.51
4 , 5 , 5 , 4.37 , 3.85 , New , 0.52
7 , 0 , 6 , 4.4 , 3.84 , New , 0.56
7 , 6 , 6 , 4.39 , 3.83 , New , 0.56
5 , 0 , 6 , 4.37 , 3.85 , New , 0.52
5 , 6 , 6 , 4.4 , 3.86 , New , 0.54
8 , 0 , 7 , 4.39 , 3.88 , New , 0.51
8 , 7 , 7 , 4.4 , 3.83 , New , 0.57
6 , 0 , 7 , 4.39 , 3.85 , New , 0.54
6 , 7 , 7 , 4.38 , 3.87 , New , 0.51
9 , 0 , 8 , 4.47 , 3.96 , New , 0.51
7 , 0 , 8 , 4.37 , 3.85 , New , 0.52
10 , 0 , 9 , 4.61 , 4.08 , New , 0.53
10 , 1 , 9 , 4.61 , 4.09 , New , 0.52
8 , 0 , 9 , 4.37 , 3.85 , New , 0.52
8 , 1 , 9 , 4.37 , 3.85 , New , 0.52
11 , 0 , 10 , 4.68 , 4.06 , New , 0.62
11 , 2 , 10 , 4.56 , 4.1 , New , 0.46
9 , 0 , 10 , 4.36 , 3.83 , New , 0.53
9 , 2 , 10 , 4.37 , 3.83 , New , 0.54
12 , 0 , 11 , 4.62 , 4.05 , New , 0.57
12 , 3 , 11 , 4.63 , 4.06 , New , 0.57
10 , 0 , 11 , 4.38 , 3.86 , New , 0.52
10 , 3 , 11 , 4.41 , 3.86 , New , 0.55
13 , 0 , 12 , 4.57 , 4.08 , New , 0.49
13 , 4 , 12 , 4.59 , 4.12 , New , 0.47
11 , 0 , 12 , 4.45 , 4.0 , New , 0.45
11 , 4 , 12 , 4.51 , 4.04 , New , 0.47
14 , 0 , 13 , 4.64 , 4.16 , New , 0.48
14 , 5 , 13 , 4.67 , 4.1 , New , 0.57
12 , 0 , 13 , 4.58 , 4.08 , New , 0.5
12 , 5 , 13 , 4.6 , 4.1 , New , 0.5
15 , 0 , 14 , 4.61 , 4.05 , New , 0.56
15 , 6 , 14 , 4.59 , 4.06 , New , 0.53
13 , 0 , 14 , 4.57 , 4.06 , New , 0.51
13 , 6 , 14 , 4.57 , 4.05 , New , 0.52
16 , 0 , 15 , 4.62 , 4.05 , New , 0.57
16 , 7 , 15 , 4.63 , 4.06 , New , 0.57
14 , 0 , 15 , 4.61 , 4.06 , New , 0.55
14 , 7 , 15 , 4.59 , 4.05 , New , 0.54
17 , 0 , 16 , 4.58 , 4.08 , New , 0.5
15 , 0 , 16 , 4.64 , 4.06 , New , 0.58
18 , 0 , 17 , 4.56 , 4.17 , New , 0.39
18 , 1 , 17 , 4.59 , 4.09 , New , 0.5
16 , 0 , 17 , 4.59 , 4.07 , New , 0.52
16 , 1 , 17 , 4.58 , 4.04 , New , 0.54
19 , 0 , 18 , 4.61 , 4.05 , New , 0.56
19 , 2 , 18 , 4.6 , 4.08 , New , 0.52
17 , 0 , 18 , 4.64 , 4.11 , New , 0.53
17 , 2 , 18 , 4.56 , 4.13 , New , 0.43
20 , 0 , 19 , 4.77 , 4.3 , New , 0.47
20 , 3 , 19 , 4.6 , 4.14 , New , 0.46
18 , 0 , 19 , 4.72 , 4.02 , New , 0.7
18 , 3 , 19 , 4.53 , 4.01 , New , 0.52
21 , 0 , 20 , 4.66 , 4.26 , New , 0.4
21 , 4 , 20 , 4.74 , 4.07 , New , 0.67
19 , 0 , 20 , 4.62 , 4.12 , New , 0.5
19 , 4 , 20 , 4.57 , 4.04 , New , 0.53
22 , 0 , 21 , 4.61 , 4.13 , New , 0.48
22 , 5 , 21 , 4.64 , 4.08 , New , 0.56
20 , 0 , 21 , 4.49 , 4.01 , New , 0.48
20 , 5 , 21 , 4.58 , 4.06 , New , 0.52
23 , 0 , 22 , 4.62 , 4.13 , New , 0.49
23 , 6 , 22 , 4.72 , 4.27 , New , 0.45
21 , 0 , 22 , 4.65 , 3.97 , New , 0.68
21 , 6 , 22 , 4.5 , 4.02 , New , 0.48
24 , 0 , 23 , 4.78 , 4.07 , New , 0.71
24 , 7 , 23 , 4.67 , 4.23 , New , 0.44
22 , 0 , 23 , 4.49 , 3.99 , New , 0.5
22 , 7 , 23 , 4.56 , 4.03 , New , 0.53
25 , 0 , 24 , 4.6 , 4.15 , New , 0.45
23 , 0 , 24 , 4.57 , 4.06 , New , 0.51
26 , 0 , 25 , 4.54 , 4.14 , New , 0.4
26 , 1 , 25 , 4.72 , 4.1 , New , 0.62
24 , 0 , 25 , 4.52 , 4.13 , New , 0.39
24 , 1 , 25 , 4.55 , 4.0 , New , 0.55
27 , 0 , 26 , 4.51 , 4.06 , New , 0.45
27 , 2 , 26 , 4.53 , 4.16 , New , 0.37
25 , 0 , 26 , 4.59 , 4.09 , New , 0.5
25 , 2 , 26 , 4.55 , 4.01 , New , 0.54
28 , 0 , 27 , 4.59 , 3.99 , New , 0.6
28 , 3 , 27 , 4.57 , 3.95 , New , 0.62
26 , 0 , 27 , 4.55 , 4.15 , New , 0.4
26 , 3 , 27 , 4.57 , 3.99 , New , 0.58
29 , 0 , 28 , 4.41 , 4.03 , New , 0.38
29 , 4 , 28 , 4.59 , 4.02 , New , 0.57
27 , 0 , 28 , 4.63 , 4.08 , New , 0.55
27 , 4 , 28 , 4.44 , 4.02 , New , 0.42
30 , 0 , 29 , 4.53 , 3.93 , New , 0.6
30 , 5 , 29 , 4.55 , 3.88 , New , 0.67
28 , 0 , 29 , 4.49 , 3.9 , New , 0.59
28 , 5 , 29 , 4.44 , 3.94 , New , 0.5
31 , 0 , 30 , 4.41 , 3.85 , New , 0.56
31 , 6 , 30 , 4.48 , 3.86 , New , 0.62
29 , 0 , 30 , 4.55 , 3.94 , New , 0.61
29 , 6 , 30 , 4.32 , 3.95 , New , 0.37
32 , 0 , 31 , 4.36 , 3.91 , New , 0.45
32 , 7 , 31 , 4.37 , 3.89 , New , 0.48
30 , 0 , 31 , 4.65 , 3.9 , New , 0.75
30 , 7 , 31 , 4.42 , 3.93 , New , 0.49
sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
1 file changed, 349 insertions(+), 231 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..65c16ef8a4 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+#define XZERO xmm23
+#define YZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,18 +58,16 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -63,319 +75,425 @@ ENTRY (MEMCHR)
# endif
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Check if we may cross page boundary with one
+ vector load. */
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle specially. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the
+ wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close
+ enough to ideal alignment. So only realign
+ L(cross_page_boundary) if rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original
+ value is necessary for computing the return address if byte
+ is found or for adjusting the length if it is not and this
+ is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
+ for memchr and rdi for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ /* NB: Divide shift count by 4 since each bit in
+ K0 represent 4 bytes. */
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the
+ wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 5
+L(aligned_more):
+ /* Check the first 4 * VEC_SIZE. Only one
+ VEC_SIZE at a time since data is only aligned to
+ VEC_SIZE. */
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the
+ end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t
+ count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near the end, handle specially. */
+ subq %rsi, %rdx
jbe L(last_4x_vec_or_less)
# endif
-
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and
+ readjust length. */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t
+ count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XZERO, %XZERO, %XZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using
+ 4x VPCMP but the bottleneck on port 5 makes it not worth
+ it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes matching esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero
+ mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(CHAR_PER_VEC * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+	/* Fall through into the case of fewer than 4 vectors
+	of length remaining.  */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained a match.  */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
+
+ /* Check VEC2 and compare any match with
+ remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if a match
+	was found in the loop.  */
+
+	/* k1 has the NOT of the matches with VEC1.  */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+	/* NB: Multiply the wchar_t count by CHAR_SIZE to get
+	the byte offset.  */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within
+ remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+	/* Test matches in data against the length mask.  */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3 (note
+	this is after remaining length was found to be >
+	CHAR_PER_VEC * 2).  */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
-END (MEMCHR)
+END(MEMCHR)
#endif
--
2.29.2
^ permalink raw reply [flat|nested] 20+ messages in thread
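The zero-masked min reduction in L(loop_4x_vec) above is the core of the
evex change: instead of four VPCMPs, which all compete for port 5, one
VPCMP records the non-matches of VEC1 as a mask, XOR turns matching
bytes of VEC2/VEC3 into zero, and a single zero-masked VPMINU folds all
three so that one compare against zero detects a match in any of them
(VEC4 keeps its own compare so the loop exit is a single kortest).  A
minimal C sketch of the idea, assuming AVX512VL+BW intrinsics -- the
function name is illustrative and this is not code from the patch:

    #include <immintrin.h>

    static int
    any_match_3x (const unsigned char *p, __m256i match)
    {
      /* k1: lanes of VEC1 that do NOT equal the search char
         (VPCMP $4).  */
      __mmask32 k1 = _mm256_cmpneq_epi8_mask
        (_mm256_loadu_si256 ((const __m256i *) p), match);
      /* XOR turns matching bytes in VEC2/VEC3 into zero bytes.  */
      __m256i v2 = _mm256_xor_si256
        (_mm256_loadu_si256 ((const __m256i *) (p + 32)), match);
      __m256i v3 = _mm256_xor_si256
        (_mm256_loadu_si256 ((const __m256i *) (p + 64)), match);
      /* Zero-masked unsigned min: lanes where VEC1 matched are forced
         to zero, otherwise min (v2, v3) survives, so any zero lane
         means some vector had a match.  */
      __m256i folded = _mm256_maskz_min_epu8 (k1, v2, v3);
      return _mm256_cmpeq_epi8_mask (folded,
                                     _mm256_setzero_si256 ()) != 0;
    }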
* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
2021-05-03 8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 17:17 ` H.J. Lu
2021-05-03 19:51 ` Noah Goldstein
2 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 17:17 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit adds some additional cases for bench-memchr.c
> including testing medium sizes and testing short length with both an
> inbound match and out of bound match.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> benchtests/bench-memchr.c | 13 +++++++++++++
> 1 file changed, 13 insertions(+)
>
> diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> index f5ced9d80d..5573f93312 100644
> --- a/benchtests/bench-memchr.c
> +++ b/benchtests/bench-memchr.c
> @@ -135,12 +135,25 @@ test_main (void)
> do_test (i, i, 256, 0);
> #endif
> }
> + for (i = 1; i < 8; ++i)
> + {
> + do_test (i, i << 5, 192, 23);
> + do_test (i, i << 5, 192, 0);
> + do_test (i, i << 5, 256, 23);
> + do_test (i, i << 5, 256, 0);
> + do_test (i, i << 5, 512, 23);
> + do_test (i, i << 5, 512, 0);
> + }
> for (i = 1; i < 32; ++i)
> {
> do_test (0, i, i + 1, 23);
> do_test (0, i, i + 1, 0);
> do_test (i, i, i + 1, 23);
> do_test (i, i, i + 1, 0);
> + do_test (0, i, i - 1, 23);
> + do_test (0, i, i - 1, 0);
> + do_test (i, i, i - 1, 23);
> + do_test (i, i, i - 1, 0);
> #ifdef USE_AS_MEMRCHR
> /* Also test the position close to the beginning for memrchr. */
> do_test (0, 1, i + 1, 23);
> --
> 2.29.2
>
LGTM. I will check it in for you.
BTW, can you apply for an account on sourceware.org:
https://sourceware.org/
so that you can push your commits directly? You can put me down
as your sponsor.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
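For reference, the harness calls being added have the shape
do_test (align, pos, len, seek_char), as can be read off the existing
calls in bench-memchr.c.  A sketch of what the new cases exercise --
the parameter roles are inferred from the calls above, not quoted from
the file:

    /* Positions 32, 64, ..., 224 inside medium buffers: stresses the
       2x-4x vector paths rather than the first-vector or main loop
       paths.  */
    do_test (i, i << 5, 192, 23);
    /* pos (i) >= len (i - 1): the seek char lies past the length, so
       this times the "no match within bounds" return; seek_char 0 vs
       23 varies the byte searched for.  */
    do_test (0, i, i - 1, 23);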
* Re: [PATCH v1 2/3] x86: Optimize memchr-avx2.S
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
@ 2021-05-03 18:50 ` H.J. Lu
2021-05-03 20:06 ` Noah Goldstein
2021-05-03 20:06 ` [PATCH v2 " Noah Goldstein
2021-05-03 22:58 ` [PATCH v3 " Noah Goldstein
2 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 18:50 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools
On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> and saving a few instructions in the loop return path. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
> 1 file changed, 262 insertions(+), 184 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..8368fcd1e1 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPCMPEQ vpcmpeqd
> +# define VPBROADCAST vpbroadcastd
> +# define CHAR_SIZE 4
> # else
> # define VPCMPEQ vpcmpeqb
> +# define VPBROADCAST vpbroadcastb
> +# define CHAR_SIZE 1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +# define ERAW_PTR_REG ecx
> +# define RRAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define ERAW_PTR_REG edi
> +# define RRAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> # endif
>
> # ifndef VZEROUPPER
> @@ -39,303 +53,367 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
> +
Remove the extra line here.
>
> .section SECTION(.text),"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)
No need for this change.
> # ifndef USE_AS_RAWMEMCHR
> /* Check for zero length. */
> test %RDX_LP, %RDX_LP
> jz L(null)
> # endif
> - movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> - vmovd %esi, %xmm0
> # ifdef USE_AS_WMEMCHR
> shl $2, %RDX_LP
> - vpbroadcastd %xmm0, %ymm0
> # else
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> # endif
> - vpbroadcastb %xmm0, %ymm0
> # endif
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + /* Broadcast CHAR to YMMMATCH. */
> + vmovd %esi, %xmm0
> + VPBROADCAST %xmm0, %ymm0
> + /* Check if we may cross page boundary with one
> + vector load. */
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> + VPCMPEQ (%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> -# else
> - jnz L(first_vec_x0)
> +	/* If length < VEC_SIZE handle specially.  */
> + cmpq $VEC_SIZE, %rdx
> + jbe L(first_vec_x0)
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax), %rax
> + cmovle %rcx, %rax
> + VZEROUPPER_RETURN
> +L(null):
> + xorl %eax, %eax
> + ret
> # endif
> - jmp L(more_4x_vec)
> -
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original
> +	value is necessary for computing the return address if a byte
> +	is found, or for adjusting the length if it is not and this is
Fit comments to 72 columns.
> + memchr. */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> + rcx for memchr and rdi for rawmemchr. */
> + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Calculate length until end of page (length
> + checked for a match). */
> + leaq 1(%ALGN_PTR_REG), %rsi
> + subq %RRAW_PTR_REG, %rsi
> +# endif
> /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> + sarxl %ERAW_PTR_REG, %eax, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> + addq %RRAW_PTR_REG, %rax
> L(return_vzeroupper):
> ZERO_UPPER_VEC_REGISTERS_RETURN
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + incq %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 2 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
>
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> + .p2align 4
> +L(aligned_more):
> + /* Check the first 4 * VEC_SIZE. Only one
> + VEC_SIZE at a time since data is only aligned to
> + VEC_SIZE. */
Fit comments to 72 columns.
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> + /* Align data to VEC_SIZE - 1. */
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + orq $(VEC_SIZE - 1), %rdi
> + /* esi is for adjusting length to see if near the
> + end. */
Fit comments to 72 columns.
> + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> + orq $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially.
> + */
Put the comments on one line.
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
>
> # ifndef USE_AS_RAWMEMCHR
> +	/* Check if we are within the last VEC_SIZE * 4 bytes of the length.  */
> subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + jbe L(last_4x_vec_or_less_cmpeq)
> + /* Align data to VEC_SIZE * 4 - 1 for the loop
> + and readjust length. */
> + incq %rdi
> + movl %edi, %ecx
> + orq $(VEC_SIZE * 4 - 1), %rdi
> + andl $(VEC_SIZE * 4 - 1), %ecx
> addq %rcx, %rdx
> +# else
> + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> + incq %rdi
> + orq $(VEC_SIZE * 4 - 1), %rdi
> # endif
>
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> vpor %ymm1, %ymm2, %ymm5
> vpor %ymm3, %ymm4, %ymm6
> vpor %ymm5, %ymm6, %ymm5
>
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + vpmovmskb %ymm5, %ecx
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + testl %ecx, %ecx
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec)
> + testl %ecx, %ecx
> + jnz L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
>
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(loop_4x_vec)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +	/* Fall through into the case of fewer than 4 vectors
> +	of length remaining.  */
Fit comments to 72 columns.
> + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> + .p2align 4
> +L(last_4x_vec_or_less):
> +	/* Check if first VEC contained a match.  */
> testl %eax, %eax
> - jnz L(first_vec_x1)
> + jnz L(first_vec_x1_check)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> + /* If remaining length > VEC_SIZE * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jg L(last_4x_vec)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> +L(last_2x_vec):
> + /* If remaining length < VEC_SIZE. */
> + addl $VEC_SIZE, %edx
> + jle L(zero_end)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + /* Check VEC2 and compare any match with
> + remaining length. */
Fit comments to 72 columns.
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> - jnz L(first_vec_x3_check)
> - xorl %eax, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> +L(zero_end):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if a match
> +	was found in the loop.  */
Fit comments to 72 columns.
> +
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> -
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - VZEROUPPER_RETURN
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> - tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + vpmovmskb %ymm3, %eax
> + /* Combine VEC3 matches (eax) with VEC4 matches
> + (ecx). */
Fit comments to 72 columns.
> + salq $32, %rcx
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 2 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>
> .p2align 4
> L(first_vec_x1_check):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> + /* Adjust length. */
> + subl $-(VEC_SIZE * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + incq %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> + .p2align 4
> +L(set_zero_end):
> + xorl %eax, %eax
> + VZEROUPPER_RETURN
> +# endif
>
> .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 4 - 1), %rdi
> +# else
> + incq %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 3 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> +# ifndef USE_AS_RAWMEMCHR
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - jmp L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
>
> +	/* If remaining length <= VEC_SIZE * 2.  */
> + addl $(VEC_SIZE * 2), %edx
> + jle L(last_2x_vec)
> .p2align 4
> -L(null):
> - xorl %eax, %eax
> - ret
> -# endif
> +L(last_4x_vec):
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0):
> - tzcntl %eax, %eax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
>
> - .p2align 4
> -L(first_vec_x1):
> - tzcntl %eax, %eax
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + /* Create mask for possible matches within
> + remaining length. */
Fit comments to 72 columns.
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
>
> - .p2align 4
> -L(first_vec_x2):
> +	/* Test matches in data against the length mask.  */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> +	/* If remaining length <= VEC_SIZE * 3 (note this
> +	is after remaining length was found to be > VEC_SIZE * 2).
Fit comments to 72 columns.
> + */
> + subl $VEC_SIZE, %edx
> + jbe L(zero_end2)
> +
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Shift remaining length mask for last VEC. */
> + shrq $32, %rcx
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 2), %rax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> addq %rdi, %rax
> +L(zero_end2):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(4x_vec_end):
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - vpmovmskb %ymm2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - vpmovmskb %ymm4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 3), %rax
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# endif
>
> -END (MEMCHR)
> +END(MEMCHR)
No need for this change.
> #endif
> --
> 2.29.2
>
Thanks.
H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
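The bzhiq sequence in the quoted diff above is a branchless bounds
check: BZHI zeroes every bit of the match mask at an index >= the
remaining length, so a stray match past the end of the buffer is
discarded without a compare-and-branch.  A rough C equivalent, assuming
BMI2 and illustrative names -- not the patch's code:

    #include <immintrin.h>

    static long
    first_match_in_bounds (unsigned long long match_bits,
                           unsigned long long remaining)
    {
      /* BZHI: clear all bits at positions >= remaining.  */
      unsigned long long in_bounds = _bzhi_u64 (match_bits, remaining);
      /* tzcnt of the surviving bits is the match offset; -1 if every
         match was out of bounds.  */
      return in_bounds ? __builtin_ctzll (in_bounds) : -1;
    }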
* Re: [PATCH v1 3/3] x86: Optimize memchr-evex.S
2021-05-03 8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 18:58 ` H.J. Lu
2021-05-03 20:06 ` Noah Goldstein
0 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 18:58 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools
On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Tests were run on the following CPUs:
>
> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
>
> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
>
> Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
>
> All times are the geometric mean of N=20. The unit of time is
> seconds.
>
> "Cur" refers to the current implementation
> "New" refers to this patches implementation
>
> Note: The numbers for size = [1, 32] are highly dependent on function
> alignment. That being said, the new implementation, which uses cmovcc
> instead of a branch for the [1, 32] case (mostly because of the high
> variance with different alignments), is far more consistent and
> performs about as well (and should be an even bigger improvement in
> cases where the sizes / positions are not 100% predictable).
>
> For memchr-evex the numbers are a near universal improvement. The case
> where the current implementation is better is size = 0, and for size =
> [1, 32] with pos < size the two implementations are about the
> same. For size = [1, 32] with pos > size, for medium range sizes, and
> for large sizes, however, the new implementation is faster.
>
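A rough C picture of the cmovcc return mentioned above -- illustrative
only, and it assumes the compiler lowers the conditional to a cmov
rather than a branch:

    #include <stddef.h>

    /* bits must be non-zero: __builtin_ctz is undefined on 0.  */
    static const char *
    select_result (const char *base, unsigned int bits, size_t len)
    {
      size_t idx = __builtin_ctz (bits);   /* tzcnt */
      /* An out-of-bounds match selects NULL; with a cmov the cost is
         the same whether or not the match position is predictable.  */
      return idx < len ? base + idx : NULL;
    }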
> Results For Tigerlake memchr-evex
> size , algn , Pos , Cur T , New T , Win , Dif
> 2048 , 0 , , 32 5.58 , 5.22 , New , 0.36
> 256 , 1 , , 64 5.22 , 4.93 , New , 0.29
> 2048 , 0 , , 64 5.22 , 4.89 , New , 0.33
> 256 , 2 , , 64 5.14 , 4.81 , New , 0.33
> 2048 , 0 , , 128 6.3 , 5.67 , New , 0.63
> 256 , 3 , , 64 5.22 , 4.9 , New , 0.32
> 2048 , 0 , , 256 11.07 , 10.92 , New , 0.15
> 256 , 4 , , 64 5.16 , 4.86 , New , 0.3
> 2048 , 0 , , 512 15.66 , 14.81 , New , 0.85
> 256 , 5 , , 64 5.15 , 4.84 , New , 0.31
> 2048 , 0 , , 1024 25.7 , 23.02 , New , 2.68
> 256 , 6 , , 64 5.12 , 4.89 , New , 0.23
> 2048 , 0 , , 2048 42.34 , 37.71 , New , 4.63
> 256 , 7 , , 64 5.03 , 4.62 , New , 0.41
> 192 , 1 , , 32 4.96 , 4.28 , New , 0.68
> 256 , 1 , , 32 4.95 , 4.28 , New , 0.67
> 512 , 1 , , 32 4.94 , 4.29 , New , 0.65
> 192 , 2 , , 64 5.1 , 4.8 , New , 0.3
> 512 , 2 , , 64 5.12 , 4.72 , New , 0.4
> 192 , 3 , , 96 5.54 , 5.12 , New , 0.42
> 256 , 3 , , 96 5.52 , 5.15 , New , 0.37
> 512 , 3 , , 96 5.51 , 5.16 , New , 0.35
> 192 , 4 , , 128 6.1 , 5.53 , New , 0.57
> 256 , 4 , , 128 6.09 , 5.49 , New , 0.6
> 512 , 4 , , 128 6.08 , 5.48 , New , 0.6
> 192 , 5 , , 160 7.42 , 6.71 , New , 0.71
> 256 , 5 , , 160 6.86 , 6.71 , New , 0.15
> 512 , 5 , , 160 9.28 , 8.68 , New , 0.6
> 192 , 6 , , 192 7.94 , 7.47 , New , 0.47
> 256 , 6 , , 192 7.62 , 7.17 , New , 0.45
> 512 , 6 , , 192 9.2 , 9.16 , New , 0.04
> 192 , 7 , , 224 8.02 , 7.43 , New , 0.59
> 256 , 7 , , 224 8.34 , 7.85 , New , 0.49
> 512 , 7 , , 224 9.89 , 9.16 , New , 0.73
> 2 , 0 , , 1 3.0 , 3.0 , Eq , 0.0
> 2 , 1 , , 1 3.0 , 3.0 , Eq , 0.0
> 0 , 0 , , 1 3.01 , 3.6 , Cur , 0.59
> 0 , 1 , , 1 3.01 , 3.6 , Cur , 0.59
> 3 , 0 , , 2 3.0 , 3.0 , Eq , 0.0
> 3 , 2 , , 2 3.0 , 3.0 , Eq , 0.0
> 1 , 0 , , 2 3.6 , 3.0 , New , 0.6
> 1 , 2 , , 2 3.6 , 3.0 , New , 0.6
> 4 , 0 , , 3 3.01 , 3.01 , Eq , 0.0
> 4 , 3 , , 3 3.01 , 3.01 , Eq , 0.0
> 2 , 0 , , 3 3.62 , 3.02 , New , 0.6
> 2 , 3 , , 3 3.62 , 3.03 , New , 0.59
> 5 , 0 , , 4 3.02 , 3.03 , Cur , 0.01
> 5 , 4 , , 4 3.02 , 3.02 , Eq , 0.0
> 3 , 0 , , 4 3.63 , 3.02 , New , 0.61
> 3 , 4 , , 4 3.63 , 3.04 , New , 0.59
> 6 , 0 , , 5 3.05 , 3.04 , New , 0.01
> 6 , 5 , , 5 3.02 , 3.02 , Eq , 0.0
> 4 , 0 , , 5 3.63 , 3.02 , New , 0.61
> 4 , 5 , , 5 3.64 , 3.03 , New , 0.61
> 7 , 0 , , 6 3.03 , 3.03 , Eq , 0.0
> 7 , 6 , , 6 3.02 , 3.02 , Eq , 0.0
> 5 , 0 , , 6 3.64 , 3.01 , New , 0.63
> 5 , 6 , , 6 3.64 , 3.03 , New , 0.61
> 8 , 0 , , 7 3.03 , 3.04 , Cur , 0.01
> 8 , 7 , , 7 3.04 , 3.04 , Eq , 0.0
> 6 , 0 , , 7 3.67 , 3.04 , New , 0.63
> 6 , 7 , , 7 3.65 , 3.05 , New , 0.6
> 9 , 0 , , 8 3.05 , 3.05 , Eq , 0.0
> 7 , 0 , , 8 3.67 , 3.05 , New , 0.62
> 10 , 0 , , 9 3.06 , 3.06 , Eq , 0.0
> 10 , 1 , , 9 3.06 , 3.06 , Eq , 0.0
> 8 , 0 , , 9 3.67 , 3.06 , New , 0.61
> 8 , 1 , , 9 3.67 , 3.06 , New , 0.61
> 11 , 0 , , 10 3.06 , 3.06 , Eq , 0.0
> 11 , 2 , , 10 3.07 , 3.06 , New , 0.01
> 9 , 0 , , 10 3.67 , 3.05 , New , 0.62
> 9 , 2 , , 10 3.67 , 3.06 , New , 0.61
> 12 , 0 , , 11 3.06 , 3.06 , Eq , 0.0
> 12 , 3 , , 11 3.06 , 3.06 , Eq , 0.0
> 10 , 0 , , 11 3.67 , 3.06 , New , 0.61
> 10 , 3 , , 11 3.67 , 3.06 , New , 0.61
> 13 , 0 , , 12 3.06 , 3.07 , Cur , 0.01
> 13 , 4 , , 12 3.06 , 3.07 , Cur , 0.01
> 11 , 0 , , 12 3.67 , 3.11 , New , 0.56
> 11 , 4 , , 12 3.68 , 3.12 , New , 0.56
> 14 , 0 , , 13 3.07 , 3.1 , Cur , 0.03
> 14 , 5 , , 13 3.06 , 3.07 , Cur , 0.01
> 12 , 0 , , 13 3.67 , 3.07 , New , 0.6
> 12 , 5 , , 13 3.67 , 3.08 , New , 0.59
> 15 , 0 , , 14 3.06 , 3.06 , Eq , 0.0
> 15 , 6 , , 14 3.07 , 3.06 , New , 0.01
> 13 , 0 , , 14 3.67 , 3.06 , New , 0.61
> 13 , 6 , , 14 3.68 , 3.06 , New , 0.62
> 16 , 0 , , 15 3.06 , 3.06 , Eq , 0.0
> 16 , 7 , , 15 3.06 , 3.05 , New , 0.01
> 14 , 0 , , 15 3.68 , 3.06 , New , 0.62
> 14 , 7 , , 15 3.67 , 3.06 , New , 0.61
> 17 , 0 , , 16 3.07 , 3.06 , New , 0.01
> 15 , 0 , , 16 3.68 , 3.06 , New , 0.62
> 18 , 0 , , 17 3.06 , 3.06 , Eq , 0.0
> 18 , 1 , , 17 3.06 , 3.06 , Eq , 0.0
> 16 , 0 , , 17 3.67 , 3.06 , New , 0.61
> 16 , 1 , , 17 3.67 , 3.05 , New , 0.62
> 19 , 0 , , 18 3.07 , 3.06 , New , 0.01
> 19 , 2 , , 18 3.06 , 3.06 , Eq , 0.0
> 17 , 0 , , 18 3.68 , 3.08 , New , 0.6
> 17 , 2 , , 18 3.68 , 3.06 , New , 0.62
> 20 , 0 , , 19 3.06 , 3.06 , Eq , 0.0
> 20 , 3 , , 19 3.06 , 3.06 , Eq , 0.0
> 18 , 0 , , 19 3.68 , 3.06 , New , 0.62
> 18 , 3 , , 19 3.68 , 3.06 , New , 0.62
> 21 , 0 , , 20 3.06 , 3.06 , Eq , 0.0
> 21 , 4 , , 20 3.06 , 3.06 , Eq , 0.0
> 19 , 0 , , 20 3.67 , 3.06 , New , 0.61
> 19 , 4 , , 20 3.67 , 3.06 , New , 0.61
> 22 , 0 , , 21 3.06 , 3.06 , Eq , 0.0
> 22 , 5 , , 21 3.06 , 3.06 , Eq , 0.0
> 20 , 0 , , 21 3.67 , 3.05 , New , 0.62
> 20 , 5 , , 21 3.68 , 3.06 , New , 0.62
> 23 , 0 , , 22 3.07 , 3.06 , New , 0.01
> 23 , 6 , , 22 3.06 , 3.06 , Eq , 0.0
> 21 , 0 , , 22 3.68 , 3.07 , New , 0.61
> 21 , 6 , , 22 3.67 , 3.06 , New , 0.61
> 24 , 0 , , 23 3.19 , 3.06 , New , 0.13
> 24 , 7 , , 23 3.08 , 3.06 , New , 0.02
> 22 , 0 , , 23 3.69 , 3.06 , New , 0.63
> 22 , 7 , , 23 3.68 , 3.06 , New , 0.62
> 25 , 0 , , 24 3.07 , 3.06 , New , 0.01
> 23 , 0 , , 24 3.68 , 3.06 , New , 0.62
> 26 , 0 , , 25 3.06 , 3.05 , New , 0.01
> 26 , 1 , , 25 3.07 , 3.06 , New , 0.01
> 24 , 0 , , 25 3.67 , 3.05 , New , 0.62
> 24 , 1 , , 25 3.68 , 3.06 , New , 0.62
> 27 , 0 , , 26 3.12 , 3.06 , New , 0.06
> 27 , 2 , , 26 3.08 , 3.06 , New , 0.02
> 25 , 0 , , 26 3.69 , 3.06 , New , 0.63
> 25 , 2 , , 26 3.67 , 3.06 , New , 0.61
> 28 , 0 , , 27 3.06 , 3.06 , Eq , 0.0
> 28 , 3 , , 27 3.06 , 3.06 , Eq , 0.0
> 26 , 0 , , 27 3.67 , 3.06 , New , 0.61
> 26 , 3 , , 27 3.67 , 3.06 , New , 0.61
> 29 , 0 , , 28 3.06 , 3.06 , Eq , 0.0
> 29 , 4 , , 28 3.06 , 3.06 , Eq , 0.0
> 27 , 0 , , 28 3.68 , 3.05 , New , 0.63
> 27 , 4 , , 28 3.67 , 3.06 , New , 0.61
> 30 , 0 , , 29 3.06 , 3.06 , Eq , 0.0
> 30 , 5 , , 29 3.06 , 3.06 , Eq , 0.0
> 28 , 0 , , 29 3.67 , 3.06 , New , 0.61
> 28 , 5 , , 29 3.68 , 3.06 , New , 0.62
> 31 , 0 , , 30 3.06 , 3.06 , Eq , 0.0
> 31 , 6 , , 30 3.06 , 3.06 , Eq , 0.0
> 29 , 0 , , 30 3.68 , 3.06 , New , 0.62
> 29 , 6 , , 30 3.7 , 3.06 , New , 0.64
> 32 , 0 , , 31 3.17 , 3.06 , New , 0.11
> 32 , 7 , , 31 3.12 , 3.06 , New , 0.06
> 30 , 0 , , 31 3.68 , 3.06 , New , 0.62
> 30 , 7 , , 31 3.68 , 3.06 , New , 0.62
>
> Results For Icelake memchr-evex
> size , algn , Pos , Cur T , New T , Win , Dif
> 2048 , 0 , , 32 4.94 , 4.26 , New , 0.68
> 256 , 1 , , 64 4.5 , 4.13 , New , 0.37
> 2048 , 0 , , 64 4.19 , 3.9 , New , 0.29
> 256 , 2 , , 64 4.19 , 3.87 , New , 0.32
> 2048 , 0 , , 128 4.96 , 4.53 , New , 0.43
> 256 , 3 , , 64 4.07 , 3.86 , New , 0.21
> 2048 , 0 , , 256 8.77 , 8.61 , New , 0.16
> 256 , 4 , , 64 4.08 , 3.87 , New , 0.21
> 2048 , 0 , , 512 12.22 , 11.67 , New , 0.55
> 256 , 5 , , 64 4.12 , 3.83 , New , 0.29
> 2048 , 0 , , 1024 20.06 , 18.09 , New , 1.97
> 256 , 6 , , 64 4.2 , 3.95 , New , 0.25
> 2048 , 0 , , 2048 33.83 , 30.62 , New , 3.21
> 256 , 7 , , 64 4.3 , 4.04 , New , 0.26
> 192 , 1 , , 32 4.2 , 3.71 , New , 0.49
> 256 , 1 , , 32 4.24 , 3.76 , New , 0.48
> 512 , 1 , , 32 4.29 , 3.74 , New , 0.55
> 192 , 2 , , 64 4.42 , 4.0 , New , 0.42
> 512 , 2 , , 64 4.17 , 3.83 , New , 0.34
> 192 , 3 , , 96 4.44 , 4.26 , New , 0.18
> 256 , 3 , , 96 4.45 , 4.14 , New , 0.31
> 512 , 3 , , 96 4.42 , 4.15 , New , 0.27
> 192 , 4 , , 128 4.93 , 4.45 , New , 0.48
> 256 , 4 , , 128 4.93 , 4.47 , New , 0.46
> 512 , 4 , , 128 4.95 , 4.47 , New , 0.48
> 192 , 5 , , 160 5.95 , 5.44 , New , 0.51
> 256 , 5 , , 160 5.59 , 5.47 , New , 0.12
> 512 , 5 , , 160 7.59 , 7.34 , New , 0.25
> 192 , 6 , , 192 6.53 , 6.08 , New , 0.45
> 256 , 6 , , 192 6.2 , 5.88 , New , 0.32
> 512 , 6 , , 192 7.53 , 7.62 , Cur , 0.09
> 192 , 7 , , 224 6.62 , 6.12 , New , 0.5
> 256 , 7 , , 224 6.79 , 6.51 , New , 0.28
> 512 , 7 , , 224 8.12 , 7.61 , New , 0.51
> 2 , 0 , , 1 2.5 , 2.54 , Cur , 0.04
> 2 , 1 , , 1 2.56 , 2.55 , New , 0.01
> 0 , 0 , , 1 2.57 , 3.12 , Cur , 0.55
> 0 , 1 , , 1 2.59 , 3.14 , Cur , 0.55
> 3 , 0 , , 2 2.62 , 2.63 , Cur , 0.01
> 3 , 2 , , 2 2.66 , 2.67 , Cur , 0.01
> 1 , 0 , , 2 3.24 , 2.72 , New , 0.52
> 1 , 2 , , 2 3.28 , 2.75 , New , 0.53
> 4 , 0 , , 3 2.78 , 2.8 , Cur , 0.02
> 4 , 3 , , 3 2.8 , 2.82 , Cur , 0.02
> 2 , 0 , , 3 3.38 , 2.86 , New , 0.52
> 2 , 3 , , 3 3.41 , 2.89 , New , 0.52
> 5 , 0 , , 4 2.88 , 2.91 , Cur , 0.03
> 5 , 4 , , 4 2.88 , 2.92 , Cur , 0.04
> 3 , 0 , , 4 3.48 , 2.93 , New , 0.55
> 3 , 4 , , 4 3.47 , 2.93 , New , 0.54
> 6 , 0 , , 5 2.95 , 2.94 , New , 0.01
> 6 , 5 , , 5 2.91 , 2.92 , Cur , 0.01
> 4 , 0 , , 5 3.47 , 2.9 , New , 0.57
> 4 , 5 , , 5 3.43 , 2.91 , New , 0.52
> 7 , 0 , , 6 2.87 , 2.9 , Cur , 0.03
> 7 , 6 , , 6 2.87 , 2.89 , Cur , 0.02
> 5 , 0 , , 6 3.44 , 2.88 , New , 0.56
> 5 , 6 , , 6 3.41 , 2.87 , New , 0.54
> 8 , 0 , , 7 2.86 , 2.87 , Cur , 0.01
> 8 , 7 , , 7 2.86 , 2.87 , Cur , 0.01
> 6 , 0 , , 7 3.43 , 2.87 , New , 0.56
> 6 , 7 , , 7 3.44 , 2.87 , New , 0.57
> 9 , 0 , , 8 2.86 , 2.88 , Cur , 0.02
> 7 , 0 , , 8 3.41 , 2.89 , New , 0.52
> 10 , 0 , , 9 2.83 , 2.87 , Cur , 0.04
> 10 , 1 , , 9 2.82 , 2.87 , Cur , 0.05
> 8 , 0 , , 9 3.4 , 2.89 , New , 0.51
> 8 , 1 , , 9 3.41 , 2.87 , New , 0.54
> 11 , 0 , , 10 2.83 , 2.88 , Cur , 0.05
> 11 , 2 , , 10 2.84 , 2.88 , Cur , 0.04
> 9 , 0 , , 10 3.41 , 2.87 , New , 0.54
> 9 , 2 , , 10 3.41 , 2.88 , New , 0.53
> 12 , 0 , , 11 2.83 , 2.89 , Cur , 0.06
> 12 , 3 , , 11 2.85 , 2.87 , Cur , 0.02
> 10 , 0 , , 11 3.41 , 2.87 , New , 0.54
> 10 , 3 , , 11 3.42 , 2.88 , New , 0.54
> 13 , 0 , , 12 2.86 , 2.87 , Cur , 0.01
> 13 , 4 , , 12 2.84 , 2.88 , Cur , 0.04
> 11 , 0 , , 12 3.43 , 2.87 , New , 0.56
> 11 , 4 , , 12 3.49 , 2.87 , New , 0.62
> 14 , 0 , , 13 2.85 , 2.86 , Cur , 0.01
> 14 , 5 , , 13 2.85 , 2.86 , Cur , 0.01
> 12 , 0 , , 13 3.41 , 2.86 , New , 0.55
> 12 , 5 , , 13 3.44 , 2.85 , New , 0.59
> 15 , 0 , , 14 2.83 , 2.87 , Cur , 0.04
> 15 , 6 , , 14 2.82 , 2.86 , Cur , 0.04
> 13 , 0 , , 14 3.41 , 2.86 , New , 0.55
> 13 , 6 , , 14 3.4 , 2.86 , New , 0.54
> 16 , 0 , , 15 2.84 , 2.86 , Cur , 0.02
> 16 , 7 , , 15 2.83 , 2.85 , Cur , 0.02
> 14 , 0 , , 15 3.41 , 2.85 , New , 0.56
> 14 , 7 , , 15 3.39 , 2.87 , New , 0.52
> 17 , 0 , , 16 2.83 , 2.87 , Cur , 0.04
> 15 , 0 , , 16 3.4 , 2.85 , New , 0.55
> 18 , 0 , , 17 2.83 , 2.86 , Cur , 0.03
> 18 , 1 , , 17 2.85 , 2.84 , New , 0.01
> 16 , 0 , , 17 3.41 , 2.85 , New , 0.56
> 16 , 1 , , 17 3.4 , 2.86 , New , 0.54
> 19 , 0 , , 18 2.8 , 2.84 , Cur , 0.04
> 19 , 2 , , 18 2.82 , 2.83 , Cur , 0.01
> 17 , 0 , , 18 3.39 , 2.86 , New , 0.53
> 17 , 2 , , 18 3.39 , 2.84 , New , 0.55
> 20 , 0 , , 19 2.85 , 2.87 , Cur , 0.02
> 20 , 3 , , 19 2.88 , 2.87 , New , 0.01
> 18 , 0 , , 19 3.38 , 2.85 , New , 0.53
> 18 , 3 , , 19 3.4 , 2.85 , New , 0.55
> 21 , 0 , , 20 2.83 , 2.85 , Cur , 0.02
> 21 , 4 , , 20 2.88 , 2.85 , New , 0.03
> 19 , 0 , , 20 3.39 , 2.84 , New , 0.55
> 19 , 4 , , 20 3.39 , 2.96 , New , 0.43
> 22 , 0 , , 21 2.84 , 2.9 , Cur , 0.06
> 22 , 5 , , 21 2.81 , 2.84 , Cur , 0.03
> 20 , 0 , , 21 3.41 , 2.81 , New , 0.6
> 20 , 5 , , 21 3.38 , 2.83 , New , 0.55
> 23 , 0 , , 22 2.8 , 2.82 , Cur , 0.02
> 23 , 6 , , 22 2.81 , 2.83 , Cur , 0.02
> 21 , 0 , , 22 3.35 , 2.81 , New , 0.54
> 21 , 6 , , 22 3.34 , 2.81 , New , 0.53
> 24 , 0 , , 23 2.77 , 2.84 , Cur , 0.07
> 24 , 7 , , 23 2.78 , 2.8 , Cur , 0.02
> 22 , 0 , , 23 3.34 , 2.79 , New , 0.55
> 22 , 7 , , 23 3.32 , 2.79 , New , 0.53
> 25 , 0 , , 24 2.77 , 2.8 , Cur , 0.03
> 23 , 0 , , 24 3.29 , 2.79 , New , 0.5
> 26 , 0 , , 25 2.73 , 2.78 , Cur , 0.05
> 26 , 1 , , 25 2.75 , 2.79 , Cur , 0.04
> 24 , 0 , , 25 3.27 , 2.79 , New , 0.48
> 24 , 1 , , 25 3.27 , 2.77 , New , 0.5
> 27 , 0 , , 26 2.72 , 2.78 , Cur , 0.06
> 27 , 2 , , 26 2.75 , 2.76 , Cur , 0.01
> 25 , 0 , , 26 3.29 , 2.73 , New , 0.56
> 25 , 2 , , 26 3.3 , 2.76 , New , 0.54
> 28 , 0 , , 27 2.75 , 2.79 , Cur , 0.04
> 28 , 3 , , 27 2.77 , 2.77 , Eq , 0.0
> 26 , 0 , , 27 3.28 , 2.78 , New , 0.5
> 26 , 3 , , 27 3.29 , 2.78 , New , 0.51
> 29 , 0 , , 28 2.74 , 2.76 , Cur , 0.02
> 29 , 4 , , 28 2.74 , 2.77 , Cur , 0.03
> 27 , 0 , , 28 3.3 , 2.76 , New , 0.54
> 27 , 4 , , 28 3.3 , 2.74 , New , 0.56
> 30 , 0 , , 29 2.72 , 2.76 , Cur , 0.04
> 30 , 5 , , 29 2.74 , 2.75 , Cur , 0.01
> 28 , 0 , , 29 3.25 , 2.73 , New , 0.52
> 28 , 5 , , 29 3.3 , 2.73 , New , 0.57
> 31 , 0 , , 30 2.73 , 2.77 , Cur , 0.04
> 31 , 6 , , 30 2.74 , 2.76 , Cur , 0.02
> 29 , 0 , , 30 3.25 , 2.73 , New , 0.52
> 29 , 6 , , 30 3.26 , 2.74 , New , 0.52
> 32 , 0 , , 31 2.73 , 2.74 , Cur , 0.01
> 32 , 7 , , 31 2.73 , 2.75 , Cur , 0.02
> 30 , 0 , , 31 3.24 , 2.72 , New , 0.52
> 30 , 7 , , 31 3.24 , 2.72 , New , 0.52
>
> For memchr-avx2 the improvements are more modest though again near
> universal. The improvement is most significant for medium sizes and
> small sizes with pos > size. For small sizes with pos < size and for
> large sizes the two implementations perform roughly the same.
>
> Results For Tigerlake memchr-avx2
> size , algn , Pos , Cur T , New T , Win , Dif
> 2048 , 0 , , 32 6.15 , 6.27 , Cur , 0.12
> 256 , 1 , , 64 6.21 , 6.03 , New , 0.18
> 2048 , 0 , , 64 6.07 , 5.95 , New , 0.12
> 256 , 2 , , 64 6.01 , 5.8 , New , 0.21
> 2048 , 0 , , 128 7.05 , 6.55 , New , 0.5
> 256 , 3 , , 64 6.14 , 5.83 , New , 0.31
> 2048 , 0 , , 256 11.78 , 11.78 , Eq , 0.0
> 256 , 4 , , 64 6.1 , 5.85 , New , 0.25
> 2048 , 0 , , 512 16.32 , 15.96 , New , 0.36
> 256 , 5 , , 64 6.1 , 5.77 , New , 0.33
> 2048 , 0 , , 1024 25.38 , 25.18 , New , 0.2
> 256 , 6 , , 64 6.08 , 5.88 , New , 0.2
> 2048 , 0 , , 2048 38.56 , 38.32 , New , 0.24
> 256 , 7 , , 64 5.93 , 5.68 , New , 0.25
> 192 , 1 , , 32 5.49 , 5.3 , New , 0.19
> 256 , 1 , , 32 5.5 , 5.28 , New , 0.22
> 512 , 1 , , 32 5.48 , 5.32 , New , 0.16
> 192 , 2 , , 64 6.1 , 5.73 , New , 0.37
> 512 , 2 , , 64 5.88 , 5.72 , New , 0.16
> 192 , 3 , , 96 6.31 , 5.93 , New , 0.38
> 256 , 3 , , 96 6.32 , 5.93 , New , 0.39
> 512 , 3 , , 96 6.2 , 5.94 , New , 0.26
> 192 , 4 , , 128 6.65 , 6.4 , New , 0.25
> 256 , 4 , , 128 6.6 , 6.37 , New , 0.23
> 512 , 4 , , 128 6.74 , 6.33 , New , 0.41
> 192 , 5 , , 160 7.78 , 7.4 , New , 0.38
> 256 , 5 , , 160 7.18 , 7.4 , Cur , 0.22
> 512 , 5 , , 160 9.81 , 9.44 , New , 0.37
> 192 , 6 , , 192 9.12 , 7.77 , New , 1.35
> 256 , 6 , , 192 7.97 , 7.66 , New , 0.31
> 512 , 6 , , 192 10.14 , 9.95 , New , 0.19
> 192 , 7 , , 224 8.96 , 7.78 , New , 1.18
> 256 , 7 , , 224 8.52 , 8.23 , New , 0.29
> 512 , 7 , , 224 10.33 , 9.98 , New , 0.35
> 2 , 0 , , 1 3.61 , 3.6 , New , 0.01
> 2 , 1 , , 1 3.6 , 3.6 , Eq , 0.0
> 0 , 0 , , 1 3.02 , 3.0 , New , 0.02
> 0 , 1 , , 1 3.0 , 3.0 , Eq , 0.0
> 3 , 0 , , 2 3.6 , 3.6 , Eq , 0.0
> 3 , 2 , , 2 3.61 , 3.6 , New , 0.01
> 1 , 0 , , 2 4.82 , 3.6 , New , 1.22
> 1 , 2 , , 2 4.81 , 3.6 , New , 1.21
> 4 , 0 , , 3 3.61 , 3.61 , Eq , 0.0
> 4 , 3 , , 3 3.62 , 3.61 , New , 0.01
> 2 , 0 , , 3 4.82 , 3.62 , New , 1.2
> 2 , 3 , , 3 4.83 , 3.63 , New , 1.2
> 5 , 0 , , 4 3.63 , 3.64 , Cur , 0.01
> 5 , 4 , , 4 3.63 , 3.62 , New , 0.01
> 3 , 0 , , 4 4.84 , 3.62 , New , 1.22
> 3 , 4 , , 4 4.84 , 3.64 , New , 1.2
> 6 , 0 , , 5 3.66 , 3.64 , New , 0.02
> 6 , 5 , , 5 3.65 , 3.62 , New , 0.03
> 4 , 0 , , 5 4.83 , 3.63 , New , 1.2
> 4 , 5 , , 5 4.85 , 3.64 , New , 1.21
> 7 , 0 , , 6 3.76 , 3.79 , Cur , 0.03
> 7 , 6 , , 6 3.76 , 3.72 , New , 0.04
> 5 , 0 , , 6 4.84 , 3.62 , New , 1.22
> 5 , 6 , , 6 4.85 , 3.64 , New , 1.21
> 8 , 0 , , 7 3.64 , 3.65 , Cur , 0.01
> 8 , 7 , , 7 3.65 , 3.65 , Eq , 0.0
> 6 , 0 , , 7 4.88 , 3.64 , New , 1.24
> 6 , 7 , , 7 4.87 , 3.65 , New , 1.22
> 9 , 0 , , 8 3.66 , 3.66 , Eq , 0.0
> 7 , 0 , , 8 4.89 , 3.66 , New , 1.23
> 10 , 0 , , 9 3.67 , 3.67 , Eq , 0.0
> 10 , 1 , , 9 3.67 , 3.67 , Eq , 0.0
> 8 , 0 , , 9 4.9 , 3.67 , New , 1.23
> 8 , 1 , , 9 4.9 , 3.67 , New , 1.23
> 11 , 0 , , 10 3.68 , 3.67 , New , 0.01
> 11 , 2 , , 10 3.69 , 3.67 , New , 0.02
> 9 , 0 , , 10 4.9 , 3.67 , New , 1.23
> 9 , 2 , , 10 4.9 , 3.67 , New , 1.23
> 12 , 0 , , 11 3.71 , 3.68 , New , 0.03
> 12 , 3 , , 11 3.71 , 3.67 , New , 0.04
> 10 , 0 , , 11 4.9 , 3.67 , New , 1.23
> 10 , 3 , , 11 4.9 , 3.67 , New , 1.23
> 13 , 0 , , 12 4.24 , 4.23 , New , 0.01
> 13 , 4 , , 12 4.23 , 4.23 , Eq , 0.0
> 11 , 0 , , 12 4.9 , 3.7 , New , 1.2
> 11 , 4 , , 12 4.9 , 3.73 , New , 1.17
> 14 , 0 , , 13 3.99 , 4.01 , Cur , 0.02
> 14 , 5 , , 13 3.98 , 3.98 , Eq , 0.0
> 12 , 0 , , 13 4.9 , 3.69 , New , 1.21
> 12 , 5 , , 13 4.9 , 3.69 , New , 1.21
> 15 , 0 , , 14 3.99 , 3.97 , New , 0.02
> 15 , 6 , , 14 4.0 , 4.0 , Eq , 0.0
> 13 , 0 , , 14 4.9 , 3.67 , New , 1.23
> 13 , 6 , , 14 4.9 , 3.67 , New , 1.23
> 16 , 0 , , 15 3.99 , 4.02 , Cur , 0.03
> 16 , 7 , , 15 4.01 , 3.96 , New , 0.05
> 14 , 0 , , 15 4.93 , 3.67 , New , 1.26
> 14 , 7 , , 15 4.92 , 3.67 , New , 1.25
> 17 , 0 , , 16 4.04 , 3.99 , New , 0.05
> 15 , 0 , , 16 5.42 , 4.22 , New , 1.2
> 18 , 0 , , 17 4.01 , 3.97 , New , 0.04
> 18 , 1 , , 17 3.99 , 3.98 , New , 0.01
> 16 , 0 , , 17 5.22 , 3.98 , New , 1.24
> 16 , 1 , , 17 5.19 , 3.98 , New , 1.21
> 19 , 0 , , 18 4.0 , 3.99 , New , 0.01
> 19 , 2 , , 18 4.03 , 3.97 , New , 0.06
> 17 , 0 , , 18 5.18 , 3.99 , New , 1.19
> 17 , 2 , , 18 5.18 , 3.98 , New , 1.2
> 20 , 0 , , 19 4.02 , 3.98 , New , 0.04
> 20 , 3 , , 19 4.0 , 3.98 , New , 0.02
> 18 , 0 , , 19 5.19 , 3.97 , New , 1.22
> 18 , 3 , , 19 5.21 , 3.98 , New , 1.23
> 21 , 0 , , 20 3.98 , 4.0 , Cur , 0.02
> 21 , 4 , , 20 4.0 , 4.0 , Eq , 0.0
> 19 , 0 , , 20 5.19 , 3.99 , New , 1.2
> 19 , 4 , , 20 5.17 , 3.99 , New , 1.18
> 22 , 0 , , 21 4.03 , 3.98 , New , 0.05
> 22 , 5 , , 21 4.01 , 3.95 , New , 0.06
> 20 , 0 , , 21 5.19 , 4.0 , New , 1.19
> 20 , 5 , , 21 5.21 , 3.99 , New , 1.22
> 23 , 0 , , 22 4.06 , 3.97 , New , 0.09
> 23 , 6 , , 22 4.02 , 3.98 , New , 0.04
> 21 , 0 , , 22 5.2 , 4.02 , New , 1.18
> 21 , 6 , , 22 5.22 , 4.0 , New , 1.22
> 24 , 0 , , 23 4.15 , 3.98 , New , 0.17
> 24 , 7 , , 23 4.0 , 4.01 , Cur , 0.01
> 22 , 0 , , 23 5.28 , 4.0 , New , 1.28
> 22 , 7 , , 23 5.22 , 3.99 , New , 1.23
> 25 , 0 , , 24 4.1 , 4.04 , New , 0.06
> 23 , 0 , , 24 5.23 , 4.04 , New , 1.19
> 26 , 0 , , 25 4.1 , 4.06 , New , 0.04
> 26 , 1 , , 25 4.07 , 3.99 , New , 0.08
> 24 , 0 , , 25 5.26 , 4.02 , New , 1.24
> 24 , 1 , , 25 5.21 , 4.0 , New , 1.21
> 27 , 0 , , 26 4.17 , 4.03 , New , 0.14
> 27 , 2 , , 26 4.09 , 4.03 , New , 0.06
> 25 , 0 , , 26 5.29 , 4.1 , New , 1.19
> 25 , 2 , , 26 5.25 , 4.0 , New , 1.25
> 28 , 0 , , 27 4.06 , 4.1 , Cur , 0.04
> 28 , 3 , , 27 4.09 , 4.04 , New , 0.05
> 26 , 0 , , 27 5.26 , 4.04 , New , 1.22
> 26 , 3 , , 27 5.28 , 4.01 , New , 1.27
> 29 , 0 , , 28 4.07 , 4.02 , New , 0.05
> 29 , 4 , , 28 4.07 , 4.05 , New , 0.02
> 27 , 0 , , 28 5.25 , 4.02 , New , 1.23
> 27 , 4 , , 28 5.25 , 4.03 , New , 1.22
> 30 , 0 , , 29 4.14 , 4.06 , New , 0.08
> 30 , 5 , , 29 4.08 , 4.04 , New , 0.04
> 28 , 0 , , 29 5.26 , 4.07 , New , 1.19
> 28 , 5 , , 29 5.28 , 4.04 , New , 1.24
> 31 , 0 , , 30 4.09 , 4.08 , New , 0.01
> 31 , 6 , , 30 4.1 , 4.08 , New , 0.02
> 29 , 0 , , 30 5.28 , 4.05 , New , 1.23
> 29 , 6 , , 30 5.24 , 4.07 , New , 1.17
> 32 , 0 , , 31 4.1 , 4.13 , Cur , 0.03
> 32 , 7 , , 31 4.16 , 4.09 , New , 0.07
> 30 , 0 , , 31 5.31 , 4.09 , New , 1.22
> 30 , 7 , , 31 5.28 , 4.08 , New , 1.2
>
> Results For Icelake memchr-avx2
> size , algn , Pos , Cur T , New T , Win , Dif
> 2048 , 0 , , 32 5.74 , 5.08 , New , 0.66
> 256 , 1 , , 64 5.16 , 4.93 , New , 0.23
> 2048 , 0 , , 64 4.86 , 4.69 , New , 0.17
> 256 , 2 , , 64 4.78 , 4.7 , New , 0.08
> 2048 , 0 , , 128 5.64 , 5.0 , New , 0.64
> 256 , 3 , , 64 4.64 , 4.59 , New , 0.05
> 2048 , 0 , , 256 9.07 , 9.17 , Cur , 0.1
> 256 , 4 , , 64 4.7 , 4.6 , New , 0.1
> 2048 , 0 , , 512 12.56 , 12.33 , New , 0.23
> 256 , 5 , , 64 4.72 , 4.61 , New , 0.11
> 2048 , 0 , , 1024 19.36 , 19.49 , Cur , 0.13
> 256 , 6 , , 64 4.82 , 4.69 , New , 0.13
> 2048 , 0 , , 2048 29.99 , 30.53 , Cur , 0.54
> 256 , 7 , , 64 4.9 , 4.85 , New , 0.05
> 192 , 1 , , 32 4.89 , 4.45 , New , 0.44
> 256 , 1 , , 32 4.93 , 4.44 , New , 0.49
> 512 , 1 , , 32 4.97 , 4.45 , New , 0.52
> 192 , 2 , , 64 5.04 , 4.65 , New , 0.39
> 512 , 2 , , 64 4.75 , 4.66 , New , 0.09
> 192 , 3 , , 96 5.14 , 4.66 , New , 0.48
> 256 , 3 , , 96 5.12 , 4.66 , New , 0.46
> 512 , 3 , , 96 5.13 , 4.62 , New , 0.51
> 192 , 4 , , 128 5.65 , 4.95 , New , 0.7
> 256 , 4 , , 128 5.63 , 4.95 , New , 0.68
> 512 , 4 , , 128 5.68 , 4.96 , New , 0.72
> 192 , 5 , , 160 6.1 , 5.84 , New , 0.26
> 256 , 5 , , 160 5.58 , 5.84 , Cur , 0.26
> 512 , 5 , , 160 7.95 , 7.74 , New , 0.21
> 192 , 6 , , 192 7.07 , 6.23 , New , 0.84
> 256 , 6 , , 192 6.34 , 6.09 , New , 0.25
> 512 , 6 , , 192 8.17 , 8.13 , New , 0.04
> 192 , 7 , , 224 7.06 , 6.23 , New , 0.83
> 256 , 7 , , 224 6.76 , 6.65 , New , 0.11
> 512 , 7 , , 224 8.29 , 8.08 , New , 0.21
> 2 , 0 , , 1 3.0 , 3.04 , Cur , 0.04
> 2 , 1 , , 1 3.06 , 3.07 , Cur , 0.01
> 0 , 0 , , 1 2.57 , 2.59 , Cur , 0.02
> 0 , 1 , , 1 2.6 , 2.61 , Cur , 0.01
> 3 , 0 , , 2 3.15 , 3.17 , Cur , 0.02
> 3 , 2 , , 2 3.19 , 3.21 , Cur , 0.02
> 1 , 0 , , 2 4.32 , 3.25 , New , 1.07
> 1 , 2 , , 2 4.36 , 3.31 , New , 1.05
> 4 , 0 , , 3 3.5 , 3.52 , Cur , 0.02
> 4 , 3 , , 3 3.52 , 3.54 , Cur , 0.02
> 2 , 0 , , 3 4.51 , 3.43 , New , 1.08
> 2 , 3 , , 3 4.56 , 3.47 , New , 1.09
> 5 , 0 , , 4 3.61 , 3.65 , Cur , 0.04
> 5 , 4 , , 4 3.63 , 3.67 , Cur , 0.04
> 3 , 0 , , 4 4.64 , 3.51 , New , 1.13
> 3 , 4 , , 4 4.7 , 3.51 , New , 1.19
> 6 , 0 , , 5 3.66 , 3.68 , Cur , 0.02
> 6 , 5 , , 5 3.69 , 3.65 , New , 0.04
> 4 , 0 , , 5 4.7 , 3.49 , New , 1.21
> 4 , 5 , , 5 4.58 , 3.48 , New , 1.1
> 7 , 0 , , 6 3.6 , 3.65 , Cur , 0.05
> 7 , 6 , , 6 3.59 , 3.64 , Cur , 0.05
> 5 , 0 , , 6 4.74 , 3.65 , New , 1.09
> 5 , 6 , , 6 4.73 , 3.64 , New , 1.09
> 8 , 0 , , 7 3.6 , 3.61 , Cur , 0.01
> 8 , 7 , , 7 3.6 , 3.61 , Cur , 0.01
> 6 , 0 , , 7 4.73 , 3.6 , New , 1.13
> 6 , 7 , , 7 4.73 , 3.62 , New , 1.11
> 9 , 0 , , 8 3.59 , 3.62 , Cur , 0.03
> 7 , 0 , , 8 4.72 , 3.64 , New , 1.08
> 10 , 0 , , 9 3.57 , 3.62 , Cur , 0.05
> 10 , 1 , , 9 3.56 , 3.61 , Cur , 0.05
> 8 , 0 , , 9 4.69 , 3.63 , New , 1.06
> 8 , 1 , , 9 4.71 , 3.61 , New , 1.1
> 11 , 0 , , 10 3.58 , 3.62 , Cur , 0.04
> 11 , 2 , , 10 3.59 , 3.63 , Cur , 0.04
> 9 , 0 , , 10 4.72 , 3.61 , New , 1.11
> 9 , 2 , , 10 4.7 , 3.61 , New , 1.09
> 12 , 0 , , 11 3.58 , 3.63 , Cur , 0.05
> 12 , 3 , , 11 3.58 , 3.62 , Cur , 0.04
> 10 , 0 , , 11 4.7 , 3.6 , New , 1.1
> 10 , 3 , , 11 4.73 , 3.64 , New , 1.09
> 13 , 0 , , 12 3.6 , 3.6 , Eq , 0.0
> 13 , 4 , , 12 3.57 , 3.62 , Cur , 0.05
> 11 , 0 , , 12 4.73 , 3.62 , New , 1.11
> 11 , 4 , , 12 4.79 , 3.61 , New , 1.18
> 14 , 0 , , 13 3.61 , 3.62 , Cur , 0.01
> 14 , 5 , , 13 3.59 , 3.59 , Eq , 0.0
> 12 , 0 , , 13 4.7 , 3.61 , New , 1.09
> 12 , 5 , , 13 4.75 , 3.58 , New , 1.17
> 15 , 0 , , 14 3.58 , 3.62 , Cur , 0.04
> 15 , 6 , , 14 3.59 , 3.62 , Cur , 0.03
> 13 , 0 , , 14 4.68 , 3.6 , New , 1.08
> 13 , 6 , , 14 4.68 , 3.63 , New , 1.05
> 16 , 0 , , 15 3.57 , 3.6 , Cur , 0.03
> 16 , 7 , , 15 3.55 , 3.59 , Cur , 0.04
> 14 , 0 , , 15 4.69 , 3.61 , New , 1.08
> 14 , 7 , , 15 4.69 , 3.61 , New , 1.08
> 17 , 0 , , 16 3.56 , 3.61 , Cur , 0.05
> 15 , 0 , , 16 4.71 , 3.58 , New , 1.13
> 18 , 0 , , 17 3.57 , 3.65 , Cur , 0.08
> 18 , 1 , , 17 3.58 , 3.59 , Cur , 0.01
> 16 , 0 , , 17 4.7 , 3.58 , New , 1.12
> 16 , 1 , , 17 4.68 , 3.59 , New , 1.09
> 19 , 0 , , 18 3.51 , 3.58 , Cur , 0.07
> 19 , 2 , , 18 3.55 , 3.58 , Cur , 0.03
> 17 , 0 , , 18 4.69 , 3.61 , New , 1.08
> 17 , 2 , , 18 4.68 , 3.61 , New , 1.07
> 20 , 0 , , 19 3.57 , 3.6 , Cur , 0.03
> 20 , 3 , , 19 3.59 , 3.59 , Eq , 0.0
> 18 , 0 , , 19 4.68 , 3.59 , New , 1.09
> 18 , 3 , , 19 4.67 , 3.57 , New , 1.1
> 21 , 0 , , 20 3.61 , 3.58 , New , 0.03
> 21 , 4 , , 20 3.62 , 3.6 , New , 0.02
> 19 , 0 , , 20 4.74 , 3.57 , New , 1.17
> 19 , 4 , , 20 4.69 , 3.7 , New , 0.99
> 22 , 0 , , 21 3.57 , 3.64 , Cur , 0.07
> 22 , 5 , , 21 3.55 , 3.6 , Cur , 0.05
> 20 , 0 , , 21 4.72 , 3.55 , New , 1.17
> 20 , 5 , , 21 4.66 , 3.55 , New , 1.11
> 23 , 0 , , 22 3.56 , 3.56 , Eq , 0.0
> 23 , 6 , , 22 3.54 , 3.56 , Cur , 0.02
> 21 , 0 , , 22 4.65 , 3.53 , New , 1.12
> 21 , 6 , , 22 4.62 , 3.56 , New , 1.06
> 24 , 0 , , 23 3.5 , 3.54 , Cur , 0.04
> 24 , 7 , , 23 3.52 , 3.53 , Cur , 0.01
> 22 , 0 , , 23 4.61 , 3.51 , New , 1.1
> 22 , 7 , , 23 4.6 , 3.51 , New , 1.09
> 25 , 0 , , 24 3.5 , 3.53 , Cur , 0.03
> 23 , 0 , , 24 4.54 , 3.5 , New , 1.04
> 26 , 0 , , 25 3.47 , 3.49 , Cur , 0.02
> 26 , 1 , , 25 3.46 , 3.51 , Cur , 0.05
> 24 , 0 , , 25 4.53 , 3.51 , New , 1.02
> 24 , 1 , , 25 4.51 , 3.51 , New , 1.0
> 27 , 0 , , 26 3.44 , 3.51 , Cur , 0.07
> 27 , 2 , , 26 3.51 , 3.52 , Cur , 0.01
> 25 , 0 , , 26 4.56 , 3.46 , New , 1.1
> 25 , 2 , , 26 4.55 , 3.47 , New , 1.08
> 28 , 0 , , 27 3.47 , 3.5 , Cur , 0.03
> 28 , 3 , , 27 3.48 , 3.47 , New , 0.01
> 26 , 0 , , 27 4.52 , 3.44 , New , 1.08
> 26 , 3 , , 27 4.55 , 3.46 , New , 1.09
> 29 , 0 , , 28 3.45 , 3.49 , Cur , 0.04
> 29 , 4 , , 28 3.5 , 3.5 , Eq , 0.0
> 27 , 0 , , 28 4.56 , 3.49 , New , 1.07
> 27 , 4 , , 28 4.5 , 3.49 , New , 1.01
> 30 , 0 , , 29 3.44 , 3.48 , Cur , 0.04
> 30 , 5 , , 29 3.46 , 3.47 , Cur , 0.01
> 28 , 0 , , 29 4.49 , 3.43 , New , 1.06
> 28 , 5 , , 29 4.57 , 3.45 , New , 1.12
> 31 , 0 , , 30 3.48 , 3.48 , Eq , 0.0
> 31 , 6 , , 30 3.46 , 3.49 , Cur , 0.03
> 29 , 0 , , 30 4.49 , 3.44 , New , 1.05
> 29 , 6 , , 30 4.53 , 3.44 , New , 1.09
> 32 , 0 , , 31 3.44 , 3.45 , Cur , 0.01
> 32 , 7 , , 31 3.46 , 3.51 , Cur , 0.05
> 30 , 0 , , 31 4.48 , 3.42 , New , 1.06
> 30 , 7 , , 31 4.48 , 3.44 , New , 1.04
>
>
> Results For Skylake memchr-avx2
> size , algn , Pos , Cur T , New T , Win , Dif
> 2048 , 0 , , 32 6.61 , 5.4 , New , 1.21
> 256 , 1 , , 64 6.52 , 5.68 , New , 0.84
> 2048 , 0 , , 64 6.03 , 5.47 , New , 0.56
> 256 , 2 , , 64 6.07 , 5.42 , New , 0.65
> 2048 , 0 , , 128 7.01 , 5.83 , New , 1.18
> 256 , 3 , , 64 6.24 , 5.68 , New , 0.56
> 2048 , 0 , , 256 11.03 , 9.86 , New , 1.17
> 256 , 4 , , 64 6.17 , 5.49 , New , 0.68
> 2048 , 0 , , 512 14.11 , 13.41 , New , 0.7
> 256 , 5 , , 64 6.03 , 5.45 , New , 0.58
> 2048 , 0 , , 1024 19.82 , 19.92 , Cur , 0.1
> 256 , 6 , , 64 6.14 , 5.7 , New , 0.44
> 2048 , 0 , , 2048 30.9 , 30.59 , New , 0.31
> 256 , 7 , , 64 6.05 , 5.64 , New , 0.41
> 192 , 1 , , 32 5.6 , 4.89 , New , 0.71
> 256 , 1 , , 32 5.59 , 5.07 , New , 0.52
> 512 , 1 , , 32 5.58 , 4.93 , New , 0.65
> 192 , 2 , , 64 6.14 , 5.46 , New , 0.68
> 512 , 2 , , 64 5.95 , 5.38 , New , 0.57
> 192 , 3 , , 96 6.6 , 5.74 , New , 0.86
> 256 , 3 , , 96 6.48 , 5.37 , New , 1.11
> 512 , 3 , , 96 6.56 , 5.44 , New , 1.12
> 192 , 4 , , 128 7.04 , 6.02 , New , 1.02
> 256 , 4 , , 128 6.96 , 5.89 , New , 1.07
> 512 , 4 , , 128 6.97 , 5.99 , New , 0.98
> 192 , 5 , , 160 8.49 , 7.07 , New , 1.42
> 256 , 5 , , 160 8.1 , 6.96 , New , 1.14
> 512 , 5 , , 160 10.48 , 9.14 , New , 1.34
> 192 , 6 , , 192 8.46 , 8.52 , Cur , 0.06
> 256 , 6 , , 192 8.53 , 7.58 , New , 0.95
> 512 , 6 , , 192 10.88 , 9.06 , New , 1.82
> 192 , 7 , , 224 8.59 , 8.35 , New , 0.24
> 256 , 7 , , 224 8.86 , 7.91 , New , 0.95
> 512 , 7 , , 224 10.89 , 8.98 , New , 1.91
> 2 , 0 , , 1 4.28 , 3.62 , New , 0.66
> 2 , 1 , , 1 4.32 , 3.75 , New , 0.57
> 0 , 0 , , 1 3.76 , 3.24 , New , 0.52
> 0 , 1 , , 1 3.7 , 3.19 , New , 0.51
> 3 , 0 , , 2 4.16 , 3.67 , New , 0.49
> 3 , 2 , , 2 4.21 , 3.68 , New , 0.53
> 1 , 0 , , 2 4.25 , 3.74 , New , 0.51
> 1 , 2 , , 2 4.4 , 3.82 , New , 0.58
> 4 , 0 , , 3 4.43 , 3.88 , New , 0.55
> 4 , 3 , , 3 4.34 , 3.8 , New , 0.54
> 2 , 0 , , 3 4.33 , 3.79 , New , 0.54
> 2 , 3 , , 3 4.37 , 3.84 , New , 0.53
> 5 , 0 , , 4 4.45 , 3.87 , New , 0.58
> 5 , 4 , , 4 4.41 , 3.84 , New , 0.57
> 3 , 0 , , 4 4.34 , 3.83 , New , 0.51
> 3 , 4 , , 4 4.35 , 3.82 , New , 0.53
> 6 , 0 , , 5 4.41 , 3.88 , New , 0.53
> 6 , 5 , , 5 4.41 , 3.88 , New , 0.53
> 4 , 0 , , 5 4.35 , 3.84 , New , 0.51
> 4 , 5 , , 5 4.37 , 3.85 , New , 0.52
> 7 , 0 , , 6 4.4 , 3.84 , New , 0.56
> 7 , 6 , , 6 4.39 , 3.83 , New , 0.56
> 5 , 0 , , 6 4.37 , 3.85 , New , 0.52
> 5 , 6 , , 6 4.4 , 3.86 , New , 0.54
> 8 , 0 , , 7 4.39 , 3.88 , New , 0.51
> 8 , 7 , , 7 4.4 , 3.83 , New , 0.57
> 6 , 0 , , 7 4.39 , 3.85 , New , 0.54
> 6 , 7 , , 7 4.38 , 3.87 , New , 0.51
> 9 , 0 , , 8 4.47 , 3.96 , New , 0.51
> 7 , 0 , , 8 4.37 , 3.85 , New , 0.52
> 10 , 0 , , 9 4.61 , 4.08 , New , 0.53
> 10 , 1 , , 9 4.61 , 4.09 , New , 0.52
> 8 , 0 , , 9 4.37 , 3.85 , New , 0.52
> 8 , 1 , , 9 4.37 , 3.85 , New , 0.52
> 11 , 0 , , 10 4.68 , 4.06 , New , 0.62
> 11 , 2 , , 10 4.56 , 4.1 , New , 0.46
> 9 , 0 , , 10 4.36 , 3.83 , New , 0.53
> 9 , 2 , , 10 4.37 , 3.83 , New , 0.54
> 12 , 0 , , 11 4.62 , 4.05 , New , 0.57
> 12 , 3 , , 11 4.63 , 4.06 , New , 0.57
> 10 , 0 , , 11 4.38 , 3.86 , New , 0.52
> 10 , 3 , , 11 4.41 , 3.86 , New , 0.55
> 13 , 0 , , 12 4.57 , 4.08 , New , 0.49
> 13 , 4 , , 12 4.59 , 4.12 , New , 0.47
> 11 , 0 , , 12 4.45 , 4.0 , New , 0.45
> 11 , 4 , , 12 4.51 , 4.04 , New , 0.47
> 14 , 0 , , 13 4.64 , 4.16 , New , 0.48
> 14 , 5 , , 13 4.67 , 4.1 , New , 0.57
> 12 , 0 , , 13 4.58 , 4.08 , New , 0.5
> 12 , 5 , , 13 4.6 , 4.1 , New , 0.5
> 15 , 0 , , 14 4.61 , 4.05 , New , 0.56
> 15 , 6 , , 14 4.59 , 4.06 , New , 0.53
> 13 , 0 , , 14 4.57 , 4.06 , New , 0.51
> 13 , 6 , , 14 4.57 , 4.05 , New , 0.52
> 16 , 0 , , 15 4.62 , 4.05 , New , 0.57
> 16 , 7 , , 15 4.63 , 4.06 , New , 0.57
> 14 , 0 , , 15 4.61 , 4.06 , New , 0.55
> 14 , 7 , , 15 4.59 , 4.05 , New , 0.54
> 17 , 0 , , 16 4.58 , 4.08 , New , 0.5
> 15 , 0 , , 16 4.64 , 4.06 , New , 0.58
> 18 , 0 , , 17 4.56 , 4.17 , New , 0.39
> 18 , 1 , , 17 4.59 , 4.09 , New , 0.5
> 16 , 0 , , 17 4.59 , 4.07 , New , 0.52
> 16 , 1 , , 17 4.58 , 4.04 , New , 0.54
> 19 , 0 , , 18 4.61 , 4.05 , New , 0.56
> 19 , 2 , , 18 4.6 , 4.08 , New , 0.52
> 17 , 0 , , 18 4.64 , 4.11 , New , 0.53
> 17 , 2 , , 18 4.56 , 4.13 , New , 0.43
> 20 , 0 , , 19 4.77 , 4.3 , New , 0.47
> 20 , 3 , , 19 4.6 , 4.14 , New , 0.46
> 18 , 0 , , 19 4.72 , 4.02 , New , 0.7
> 18 , 3 , , 19 4.53 , 4.01 , New , 0.52
> 21 , 0 , , 20 4.66 , 4.26 , New , 0.4
> 21 , 4 , , 20 4.74 , 4.07 , New , 0.67
> 19 , 0 , , 20 4.62 , 4.12 , New , 0.5
> 19 , 4 , , 20 4.57 , 4.04 , New , 0.53
> 22 , 0 , , 21 4.61 , 4.13 , New , 0.48
> 22 , 5 , , 21 4.64 , 4.08 , New , 0.56
> 20 , 0 , , 21 4.49 , 4.01 , New , 0.48
> 20 , 5 , , 21 4.58 , 4.06 , New , 0.52
> 23 , 0 , , 22 4.62 , 4.13 , New , 0.49
> 23 , 6 , , 22 4.72 , 4.27 , New , 0.45
> 21 , 0 , , 22 4.65 , 3.97 , New , 0.68
> 21 , 6 , , 22 4.5 , 4.02 , New , 0.48
> 24 , 0 , , 23 4.78 , 4.07 , New , 0.71
> 24 , 7 , , 23 4.67 , 4.23 , New , 0.44
> 22 , 0 , , 23 4.49 , 3.99 , New , 0.5
> 22 , 7 , , 23 4.56 , 4.03 , New , 0.53
> 25 , 0 , , 24 4.6 , 4.15 , New , 0.45
> 23 , 0 , , 24 4.57 , 4.06 , New , 0.51
> 26 , 0 , , 25 4.54 , 4.14 , New , 0.4
> 26 , 1 , , 25 4.72 , 4.1 , New , 0.62
> 24 , 0 , , 25 4.52 , 4.13 , New , 0.39
> 24 , 1 , , 25 4.55 , 4.0 , New , 0.55
> 27 , 0 , , 26 4.51 , 4.06 , New , 0.45
> 27 , 2 , , 26 4.53 , 4.16 , New , 0.37
> 25 , 0 , , 26 4.59 , 4.09 , New , 0.5
> 25 , 2 , , 26 4.55 , 4.01 , New , 0.54
> 28 , 0 , , 27 4.59 , 3.99 , New , 0.6
> 28 , 3 , , 27 4.57 , 3.95 , New , 0.62
> 26 , 0 , , 27 4.55 , 4.15 , New , 0.4
> 26 , 3 , , 27 4.57 , 3.99 , New , 0.58
> 29 , 0 , , 28 4.41 , 4.03 , New , 0.38
> 29 , 4 , , 28 4.59 , 4.02 , New , 0.57
> 27 , 0 , , 28 4.63 , 4.08 , New , 0.55
> 27 , 4 , , 28 4.44 , 4.02 , New , 0.42
> 30 , 0 , , 29 4.53 , 3.93 , New , 0.6
> 30 , 5 , , 29 4.55 , 3.88 , New , 0.67
> 28 , 0 , , 29 4.49 , 3.9 , New , 0.59
> 28 , 5 , , 29 4.44 , 3.94 , New , 0.5
> 31 , 0 , , 30 4.41 , 3.85 , New , 0.56
> 31 , 6 , , 30 4.48 , 3.86 , New , 0.62
> 29 , 0 , , 30 4.55 , 3.94 , New , 0.61
> 29 , 6 , , 30 4.32 , 3.95 , New , 0.37
> 32 , 0 , , 31 4.36 , 3.91 , New , 0.45
> 32 , 7 , , 31 4.37 , 3.89 , New , 0.48
> 30 , 0 , , 31 4.65 , 3.9 , New , 0.75
> 30 , 7 , , 31 4.42 , 3.93 , New , 0.49
>
> sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
> 1 file changed, 349 insertions(+), 231 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..65c16ef8a4 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPBROADCAST vpbroadcastd
> -# define VPCMP vpcmpd
> -# define SHIFT_REG r8d
> +# define VPMINU vpminud
> +# define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> +# define CHAR_SIZE 4
> # else
> # define VPBROADCAST vpbroadcastb
> -# define VPCMP vpcmpb
> -# define SHIFT_REG ecx
> +# define VPMINU vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ vpcmpeqb
> +# define CHAR_SIZE 1
> # endif
>
> +# ifdef USE_AS_RAWMEMCHR
> +# define RAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define RAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> +# endif
> +
> +#define XZERO xmm23
Add a space before define. Rename XZERO to XMMZERO.
> +#define YZERO ymm23
Add a space before define. Rename YZERO to YMMZERO.
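I.e., keeping the style of the surrounding defines, something like:

# define XMMZERO	xmm23
# define YMMZERO	ymm23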
> # define XMMMATCH xmm16
> # define YMMMATCH ymm16
> # define YMM1 ymm17
> @@ -44,18 +58,16 @@
> # define YMM6 ymm22
>
> # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>
> .section .text.evex,"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)
No need for this change.
> # ifndef USE_AS_RAWMEMCHR
> /* Check for zero length. */
> test %RDX_LP, %RDX_LP
> jz L(zero)
> -# endif
> - movl %edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> - shl $2, %RDX_LP
> -# else
> +
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
> # endif
> /* Broadcast CHAR to YMMMATCH. */
> VPBROADCAST %esi, %YMMMATCH
> - /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + /* Check if we may cross page boundary with one
> + vector load. */
Fit comments to 72 columns.
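E.g. this one fits back on a single line:

	/* Check if we may cross page boundary with one vector load.  */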
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> -
> + VPCMP $0, (%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $CHAR_PER_VEC, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> - jnz L(first_vec_x0)
> + addq %rdi, %rax
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + ret
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> - jmp L(more_4x_vec)
> +L(zero):
> + xorl %eax, %eax
> + ret
>
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> + cmovle %rcx, %rax
> + ret
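(As an aside, the cmp/lea/cmovle sequence above is a branchless select.
A rough C equivalent -- an illustration only, not code from the patch:

	#include <stddef.h>

	/* Return NULL when the first match index is at or past the
	   length, without a branch; gcc/clang typically emit cmov for
	   this at -O2.  */
	static char *
	select_match (char *base, unsigned int idx, unsigned int len)
	{
	  char *ret = base + idx;
	  if (len <= idx)
	    ret = NULL;
	  return ret;
	}

The mispredictable branch becomes a data dependency, which is what keeps
the short-length timings consistent across alignments.)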
> +# else
> + /* NB: first_vec_x0 is 17 bytes which will leave
> + cross_page_boundary (which is relatively cold) close
> + enough to ideal alignment. So only realign
> + L(cross_page_boundary) if rawmemchr. */
Fit comments to 72 columns.
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original
> + value is necessary for computing the return address if byte is
> + found or adjusting length if it is not and this is
> + memchr. */
Fit comments to 72 columns.
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> + for memchr and rdi for rawmemchr. */
Fit comments to 72 columns.
> + andq $-VEC_SIZE, %ALGN_PTR_REG
> + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> + kmovd %k0, %r8d
> # ifdef USE_AS_WMEMCHR
> - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> - bytes. */
> - movl %ecx, %SHIFT_REG
> - sarl $2, %SHIFT_REG
> + /* NB: Divide shift count by 4 since each bit in
> + K0 represent 4 bytes. */
> + sarl $2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> + movl $(PAGE_SIZE / CHAR_SIZE), %esi
> + subl %eax, %esi
> # endif
> - andq $-VEC_SIZE, %rdi
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - /* Remove the leading bytes. */
> - sarxl %SHIFT_REG, %eax, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> # ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> + andl $(CHAR_PER_VEC - 1), %eax
> # endif
> + /* Remove the leading bytes. */
> + sarxl %eax, %r8d, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
> + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> + addq %RAW_PTR_REG, %rax
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> ret
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +
> + .p2align 5
> +L(aligned_more):
> + /* Check the first 4 * VEC_SIZE. Only one
> + VEC_SIZE at a time since data is only aligned to
> + VEC_SIZE. */
Fit comments to 72 columns.
>
> # ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> + /* Align data to VEC_SIZE. */
> +L(cross_page_continue):
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + andq $-VEC_SIZE, %rdi
> + /* esi is for adjusting length to see if near the
> + end. */
Fit comments to 72 columns.
> + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Divide bytes by 4 to get the wchar_t
> + count. */
> + sarl $2, %esi
> +# endif
> +# else
> + andq $-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially.
> + */
Fit comments to 72 columns.
> + subq %rsi, %rdx
> jbe L(last_4x_vec_or_less)
> # endif
> -
> -L(more_4x_vec):
> - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> - since data is only aligned to VEC_SIZE. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> -
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
> +
>
> # ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + /* Check if at last CHAR_PER_VEC * 4 length. */
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(last_4x_vec_or_less_cmpeq)
> + addq $VEC_SIZE, %rdi
>
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> + /* Align data to VEC_SIZE * 4 for the loop and
> + readjust length. */
Fit comments to 72 columns.
> +# ifdef USE_AS_WMEMCHR
> + movl %edi, %ecx
> andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + andl $(VEC_SIZE * 4 - 1), %ecx
> + /* NB: Divide bytes by 4 to get the wchar_t
> + count. */
Fit comments to 72 columns.
> + sarl $2, %ecx
> addq %rcx, %rdx
> +# else
> + addq %rdi, %rdx
> + andq $-(4 * VEC_SIZE), %rdi
> + subq %rdi, %rdx
> +# endif
> +# else
> + addq $VEC_SIZE, %rdi
> + andq $-(4 * VEC_SIZE), %rdi
> # endif
>
> + vpxorq %XZERO, %XZERO, %XZERO
> +
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> - kord %k1, %k2, %k5
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> - kord %k3, %k4, %k6
> - kortestd %k5, %k6
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + /* It would be possible to save some instructions
> + using 4x VPCMP but bottleneck on port 5 makes it not worth
> + it. */
Fit comments to 72 columns.
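(Background, my reading rather than anything stated in the patch: on
recent Intel cores the EVEX compares that write a mask register dispatch
only to port 5, while vpxorq/vpminu can also use other vector ports, so
the mixed xor/min/compare sequence spreads the loop across more
execution ports.)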
> + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> + /* xor will set bytes match esi to zero. */
> + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> + /* Reduce VEC2 / VEC3 with min and VEC1 with zero
> + mask. */
Fit comments to 72 columns.
> + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
> + VPCMP $0, %YMM3, %YZERO, %k2
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + kortestd %k2, %k3
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec)
> + kortestd %k2, %k3
> + jnz L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
>
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + subq $(CHAR_PER_VEC * 4), %rdx
> + ja L(loop_4x_vec)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + /* Fall through into less than 4 remaining
> + vectors of length case. */
Fit comments to 72 columns.
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + addq $(VEC_SIZE * 3), %rdi
> + .p2align 4
> +L(last_4x_vec_or_less):
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x1)
> + jnz L(first_vec_x1_check)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + /* If remaining length > CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jg L(last_4x_vec)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> +L(last_2x_vec):
> + /* If remaining length < CHAR_PER_VEC. */
> + addl $CHAR_PER_VEC, %edx
> + jle L(zero_end)
> +
> + /* Check VEC2 and compare any match with
> + remaining length. */
Fit comments to 72 columns.
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> + ret
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
>
> - jnz L(first_vec_x3_check)
> + .p2align 4
> +L(first_vec_x1_check):
> + tzcntl %eax, %eax
> + /* Adjust length. */
> + subl $-(CHAR_PER_VEC * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +L(set_zero_end):
> xorl %eax, %eax
> ret
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if match
> + was found in loop. */
Fit comments to 72 columns.
> +
> + /* k1 holds the NOT of the matches with VEC1. */
> kmovd %k1, %eax
> - testl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + subl $((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> + incl %eax
> +# endif
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> + VPCMP $0, %YMM2, %YZERO, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + kmovd %k2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - ret
> + jnz L(last_vec_x3_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> + kmovd %k3, %eax
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> -# endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> addq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(first_vec_x2_check):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# endif
> +# else
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
> + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> # else
> - addq %rdi, %rax
> + /* NB: Multiply bytes by CHAR_SIZE to get the
> + wchar_t count. */
Fit comments to 72 columns.
> + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> ret
>
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
> +
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jle L(last_2x_vec)
> +
> .p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Create mask for possible matches within
> + remaining length. */
Fit comments to 72 columns.
> +# ifdef USE_AS_WMEMCHR
> + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> + bzhil %edx, %ecx, %ecx
> +# else
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
> +# endif
> + /* Test matches in data against length match. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* if remaining length <= CHAR_PER_VEC * 3 (Note
> + this is after remaining length was found to be >
> + CHAR_PER_VEC * 2. */
Fit comments to 72 columns.
> + subl $CHAR_PER_VEC, %edx
> + jbe L(zero_end2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Shift remaining length mask for last VEC. */
> +# ifdef USE_AS_WMEMCHR
> + shrl $CHAR_PER_VEC, %ecx
> +# else
> + shrq $CHAR_PER_VEC, %rcx
> +# endif
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
> ret
>
> - .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> .p2align 4
> -L(4x_vec_end):
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - kmovd %k2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - kmovd %k3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - kmovd %k4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> ret
> +# endif
>
> -END (MEMCHR)
> +END(MEMCHR)
No need for this change.
> #endif
> --
> 2.29.2
>
Thanks.
H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
@ 2021-05-03 19:51 ` Noah Goldstein
2021-05-03 20:59 ` H.J. Lu
0 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 19:51 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 1:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug. This commit adds some additional cases for bench-memchr.c
> > including testing medium sizes and testing short length with both an
> > inbound match and out of bound match.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > benchtests/bench-memchr.c | 13 +++++++++++++
> > 1 file changed, 13 insertions(+)
> >
> > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> > index f5ced9d80d..5573f93312 100644
> > --- a/benchtests/bench-memchr.c
> > +++ b/benchtests/bench-memchr.c
> > @@ -135,12 +135,25 @@ test_main (void)
> > do_test (i, i, 256, 0);
> > #endif
> > }
> > + for (i = 1; i < 8; ++i)
> > + {
> > + do_test (i, i << 5, 192, 23);
> > + do_test (i, i << 5, 192, 0);
> > + do_test (i, i << 5, 256, 23);
> > + do_test (i, i << 5, 256, 0);
> > + do_test (i, i << 5, 512, 23);
> > + do_test (i, i << 5, 512, 0);
> > + }
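(Gloss, mine: with pos = i << 5 for i in [1, 8) these cover match
positions 32 through 224 against lengths 192/256/512 -- the medium-size
range called out in the commit message, including positions at or past
the end of the shortest buffer.)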
> > for (i = 1; i < 32; ++i)
> > {
> > do_test (0, i, i + 1, 23);
> > do_test (0, i, i + 1, 0);
> > do_test (i, i, i + 1, 23);
> > do_test (i, i, i + 1, 0);
> > + do_test (0, i, i - 1, 23);
> > + do_test (0, i, i - 1, 0);
> > + do_test (i, i, i - 1, 23);
> > + do_test (i, i, i - 1, 0);
> > #ifdef USE_AS_MEMRCHR
> > /* Also test the position close to the beginning for memrchr. */
> > do_test (0, 1, i + 1, 23);
> > --
> > 2.29.2
> >
>
> LGTM. I will check it in for you.
Thanks!
>
> BTW, can you apply an account on sourceware.org:
>
> https://sourceware.org/
>
> so that you can push your commits directly? You can put me down
> as your sponsor.
Done. Are there any wikis / manuals on how to properly use write access?
All I'm finding are resources on how to obtain it.
>
> Thanks.
>
> --
> H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v1 2/3] x86: Optimize memchr-avx2.S
2021-05-03 18:50 ` H.J. Lu
@ 2021-05-03 20:06 ` Noah Goldstein
0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 2:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-avx2.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving a few instructions in the loop-return path. test-memchr,
> > test-rawmemchr, and test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
> > 1 file changed, 262 insertions(+), 184 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 1fcb1c350f..8368fcd1e1 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -26,8 +26,22 @@
> >
> > # ifdef USE_AS_WMEMCHR
> > # define VPCMPEQ vpcmpeqd
> > +# define VPBROADCAST vpbroadcastd
> > +# define CHAR_SIZE 4
> > # else
> > # define VPCMPEQ vpcmpeqb
> > +# define VPBROADCAST vpbroadcastb
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# ifdef USE_AS_RAWMEMCHR
> > +# define ERAW_PTR_REG ecx
> > +# define RRAW_PTR_REG rcx
> > +# define ALGN_PTR_REG rdi
> > +# else
> > +# define ERAW_PTR_REG edi
> > +# define RRAW_PTR_REG rdi
> > +# define ALGN_PTR_REG rcx
> > # endif
> >
> > # ifndef VZEROUPPER
> > @@ -39,303 +53,367 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> > +
>
> Remove the extra line here.
Done.
>
> >
> > .section SECTION(.text),"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.
Fixed.
>
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check for zero length. */
> > test %RDX_LP, %RDX_LP
> > jz L(null)
> > # endif
> > - movl %edi, %ecx
> > - /* Broadcast CHAR to YMM0. */
> > - vmovd %esi, %xmm0
> > # ifdef USE_AS_WMEMCHR
> > shl $2, %RDX_LP
> > - vpbroadcastd %xmm0, %ymm0
> > # else
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > # endif
> > - vpbroadcastb %xmm0, %ymm0
> > # endif
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + /* Broadcast CHAR to YMMMATCH. */
> > + vmovd %esi, %xmm0
> > + VPBROADCAST %xmm0, %ymm0
> > + /* Check if we may cross page boundary with one
> > + vector load. */
> > + movl %edi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(cross_page_boundary)
> >
> > /* Check the first VEC_SIZE bytes. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > + VPCMPEQ (%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > # ifndef USE_AS_RAWMEMCHR
> > - jnz L(first_vec_x0_check)
> > - /* Adjust length and check the end of data. */
> > - subq $VEC_SIZE, %rdx
> > - jbe L(zero)
> > -# else
> > - jnz L(first_vec_x0)
> > + /* If length < CHAR_PER_VEC handle special. */
> > + cmpq $VEC_SIZE, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + testl %eax, %eax
> > + jz L(aligned_more)
> > + tzcntl %eax, %eax
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > - addq %rcx, %rdx
> > -
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > + .p2align 5
> > +L(first_vec_x0):
> > + /* Check if first match was before length. */
> > + tzcntl %eax, %eax
> > + xorl %ecx, %ecx
> > + cmpl %eax, %edx
> > + leaq (%rdi, %rax), %rax
> > + cmovle %rcx, %rax
> > + VZEROUPPER_RETURN
> > +L(null):
> > + xorl %eax, %eax
> > + ret
> > # endif
> > - jmp L(more_4x_vec)
> > -
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(cross_page_boundary):
> > + /* Save pointer before aligning as its original
> > + value is necessary for computing the return address if byte is
> > + found or adjusting length if it is not and this is
>
> Fit comments to 72 columns.
Fixed. Still working out the kinks in my formatter.
For the 72 column fill, does a tab count as 1, 4, or 8 units?
>
> > + memchr. */
> > + movq %rdi, %rcx
> > + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> > + rcx for memchr and rdi for rawmemchr. */
> > + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> > + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Calculate length until end of page (length
> > + checked for a match). */
> > + leaq 1(%ALGN_PTR_REG), %rsi
> > + subq %RRAW_PTR_REG, %rsi
> > +# endif
> > /* Remove the leading bytes. */
> > - sarl %cl, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - tzcntl %eax, %eax
> > + sarxl %ERAW_PTR_REG, %eax, %eax
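(Aside, my gloss: sarx is BMI2 -- the shift count comes from any
register without clobbering flags, and hardware masks the count mod 32
for 32-bit operands, which is why the pointer register can be used as
the count without masking its low bits first.  Roughly, as an
illustration:

	#include <stdint.h>

	/* Drop compare bits for bytes before the original pointer; the
	   shift count is implicitly mod 32, as in sarxl.  Right shift
	   of a negative value is arithmetic on the usual compilers,
	   matching sarx.  */
	static uint32_t
	drop_leading (uint32_t mask, uintptr_t ptr)
	{
	  return (uint32_t) ((int32_t) mask >> (ptr & 31));
	}
)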
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + cmpq %rsi, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > - addq %rdi, %rax
> > - addq %rcx, %rax
> > + testl %eax, %eax
> > + jz L(cross_page_continue)
> > + tzcntl %eax, %eax
> > + addq %RRAW_PTR_REG, %rax
> > L(return_vzeroupper):
> > ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > - overflow. */
> > - negq %rcx
> > - addq $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > + tzcntl %eax, %eax
> > + incq %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - /* Check the end of data. */
> > - subq %rcx, %rdx
> > - jbe L(zero)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x2):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4
> > +L(first_vec_x3):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 2 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> >
> > -L(more_4x_vec):
> > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > - since data is only aligned to VEC_SIZE. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + .p2align 4
> > +L(first_vec_x4):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > + .p2align 4
> > +L(aligned_more):
> > + /* Check the first 4 * VEC_SIZE. Only one
> > + VEC_SIZE at a time since data is only aligned to
> > + VEC_SIZE. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(cross_page_continue):
> > + /* Align data to VEC_SIZE - 1. */
> > + xorl %ecx, %ecx
> > + subl %edi, %ecx
> > + orq $(VEC_SIZE - 1), %rdi
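(My gloss on the or-based alignment: p | (VEC_SIZE - 1) equals
(p & -VEC_SIZE) + VEC_SIZE - 1, so the later 1(%rdi) addressing lands on
the next aligned vector -- one or instead of an and plus an add.  The
xor/sub pair above computes -p, used below for the length adjustment.)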
> > + /* esi is for adjusting length to see if near the
> > + end. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> > +# else
> > + orq $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +# endif
> > + /* Load first VEC regardless. */
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Adjust length. If near end handle specially.
> > + */
>
> Put the comments on one line.
Fixed.
>
> > + subq %rsi, %rdx
> > + jbe L(last_4x_vec_or_less)
> > +# endif
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - addq $(VEC_SIZE * 4), %rdi
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(first_vec_x4)
> >
> > # ifndef USE_AS_RAWMEMCHR
> > + /* Check if at last VEC_SIZE * 4 length. */
> > subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > -
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > - andq $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > + jbe L(last_4x_vec_or_less_cmpeq)
> > + /* Align data to VEC_SIZE * 4 - 1 for the loop
> > + and readjust length. */
> > + incq %rdi
> > + movl %edi, %ecx
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > + andl $(VEC_SIZE * 4 - 1), %ecx
> > addq %rcx, %rdx
> > +# else
> > + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> > + incq %rdi
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > # endif
> >
> > + /* Compare 4 * VEC at a time forward. */
> > .p2align 4
> > L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> > -
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> > vpor %ymm1, %ymm2, %ymm5
> > vpor %ymm3, %ymm4, %ymm6
> > vpor %ymm5, %ymm6, %ymm5
> >
> > - vpmovmskb %ymm5, %eax
> > - testl %eax, %eax
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > + vpmovmskb %ymm5, %ecx
> > # ifdef USE_AS_RAWMEMCHR
> > - jmp L(loop_4x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
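(Aside, my understanding of the sub-of-negative idiom: +128 would need a
4-byte immediate, while -128 fits a sign-extended imm8, so
subq $-(VEC_SIZE * 4) encodes shorter than the equivalent addq.)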
> > + testl %ecx, %ecx
> > + jz L(loop_4x_vec)
> > # else
> > - subq $(VEC_SIZE * 4), %rdx
> > - ja L(loop_4x_vec)
> > + testl %ecx, %ecx
> > + jnz L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> > - addl $(VEC_SIZE * 2), %edx
> > - jle L(last_2x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> >
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + subq $(VEC_SIZE * 4), %rdx
> > + ja L(loop_4x_vec)
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > + /* Fall through into less than 4 remaining
> > + vectors of length case. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > + .p2align 4
> > +L(last_4x_vec_or_less):
> > + /* Check if first VEC contained match. */
> > testl %eax, %eax
> > - jnz L(first_vec_x1)
> > + jnz L(first_vec_x1_check)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > + /* If remaining length > VEC_SIZE * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jg L(last_4x_vec)
> >
> > - jnz L(first_vec_x2_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > +L(last_2x_vec):
> > + /* If remaining length < VEC_SIZE. */
> > + addl $VEC_SIZE, %edx
> > + jle L(zero_end)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + /* Check VEC2 and compare any match with
> > + remaining length. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > - jnz L(first_vec_x3_check)
> > - xorl %eax, %eax
> > + tzcntl %eax, %eax
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > +L(zero_end):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(last_2x_vec):
> > - addl $(VEC_SIZE * 2), %edx
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(loop_4x_vec_end):
> > +# endif
> > + /* rawmemchr will fall through into this if match
> > + was found in loop. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > +
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > + jnz L(last_vec_x1_return)
> >
> > - jnz L(first_vec_x0_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > -
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x1_check)
> > - xorl %eax, %eax
> > - VZEROUPPER_RETURN
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0_check):
> > - tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + vpmovmskb %ymm3, %eax
> > + /* Combine VEC3 matches (eax) with VEC4 matches
> > + (ecx). */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + salq $32, %rcx
> > + orq %rcx, %rax
> > + tzcntq %rax, %rax
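(My gloss: packing both 32-bit movemasks into one 64-bit word lets a
single tzcnt find the first match across VEC3 and VEC4.  Roughly, as an
illustration only:

	#include <stdint.h>

	/* Index of the first match across two vectors; at least one
	   bit must be set, as __builtin_ctzll (0) is undefined.  */
	static unsigned int
	first_match (uint32_t m3, uint32_t m4)
	{
	  uint64_t m = ((uint64_t) m4 << 32) | m3;
	  return (unsigned int) __builtin_ctzll (m);	/* like tzcntq */
	}
)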
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 2 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# ifndef USE_AS_RAWMEMCHR
> >
> > .p2align 4
> > L(first_vec_x1_check):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $VEC_SIZE, %rax
> > + /* Adjust length. */
> > + subl $-(VEC_SIZE * 4), %edx
> > + /* Check if match within remaining length. */
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + incq %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > + .p2align 4
> > +L(set_zero_end):
> > + xorl %eax, %eax
> > + VZEROUPPER_RETURN
> > +# endif
> >
> > .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_x1_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 2), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 4 - 1), %rdi
> > +# else
> > + incq %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 3), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 3 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_RAWMEMCHR
> > .p2align 4
> > -L(zero):
> > - xorl %eax, %eax
> > - jmp L(return_vzeroupper)
> > +L(last_4x_vec_or_less_cmpeq):
> > + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + subq $-(VEC_SIZE * 4), %rdi
> > + /* Check first VEC regardless. */
> > + testl %eax, %eax
> > + jnz L(first_vec_x1_check)
> >
> > + /* If remaining length <= CHAR_PER_VEC * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jle L(last_2x_vec)
> > .p2align 4
> > -L(null):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> > +L(last_4x_vec):
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0):
> > - tzcntl %eax, %eax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> >
> > - .p2align 4
> > -L(first_vec_x1):
> > - tzcntl %eax, %eax
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + /* Create mask for possible matches within
> > + remaining length. */
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + movq $-1, %rcx
> > + bzhiq %rdx, %rcx, %rcx
> >
> > - .p2align 4
> > -L(first_vec_x2):
> > + /* Test matches in data against length match. */
> > + andl %ecx, %eax
> > + jnz L(last_vec_x3)
> > +
> > + /* If remaining length <= VEC_SIZE * 3 (note this
> > + is after remaining length was found to be > VEC_SIZE * 2).
>
> Fit comments to 72 columns.
Adjusted closer. Hopefully fixed.
>
> > + */
> > + subl $VEC_SIZE, %edx
> > + jbe L(zero_end2)
> > +
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Shift remaining length mask for last VEC. */
> > + shrq $32, %rcx
> > + andl %ecx, %eax
> > + jz L(zero_end2)
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 2), %rax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > addq %rdi, %rax
> > +L(zero_end2):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(4x_vec_end):
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > - vpmovmskb %ymm2, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > - vpmovmskb %ymm3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > - vpmovmskb %ymm4, %eax
> > - testl %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 3), %rax
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.
Fixed.
>
> > #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
* Re: [PATCH v1 3/3] x86: Optimize memchr-evex.S
2021-05-03 18:58 ` H.J. Lu
@ 2021-05-03 20:06 ` Noah Goldstein
0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 2:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-evex.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving some ALU in the alignment process, and most importantly
> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> > test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Tests were run on the following CPUs:
> >
> > Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> > Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> >
> > Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
> >
> > All times are the geometric mean of N=20. The unit of time is
> > seconds.
> >
> > "Cur" refers to the current implementation
> > "New" refers to this patches implementation
> >
> > Note: The numbers for size = [1, 32] are highly dependent on function
> > alignment. That being said, the new implementation, which uses cmovcc
> > instead of a branch for the [1, 32] case (mostly because of the high
> > variance across alignments), is far more consistent and performs about
> > as well (and should be an even bigger improvement in cases where the
> > sizes / positions are not 100% predictable).
> >
> > For memchr-evex the numbers are a near universal improvement. The case
> > where the current implementation is better is size = 0; for size =
> > [1, 32] with pos < size the two implementations are about the
> > same. For size = [1, 32] with pos > size, for medium range sizes, and
> > for large sizes, however, the new implementation is faster.
> >
> > Results For Tigerlake memchr-evex
> > size , algn , Pos , Cur T , New T , Win , Dif
> > 2048 , 0 , , 32 5.58 , 5.22 , New , 0.36
> > 256 , 1 , , 64 5.22 , 4.93 , New , 0.29
> > 2048 , 0 , , 64 5.22 , 4.89 , New , 0.33
> > 256 , 2 , , 64 5.14 , 4.81 , New , 0.33
> > 2048 , 0 , , 128 6.3 , 5.67 , New , 0.63
> > 256 , 3 , , 64 5.22 , 4.9 , New , 0.32
> > 2048 , 0 , , 256 11.07 , 10.92 , New , 0.15
> > 256 , 4 , , 64 5.16 , 4.86 , New , 0.3
> > 2048 , 0 , , 512 15.66 , 14.81 , New , 0.85
> > 256 , 5 , , 64 5.15 , 4.84 , New , 0.31
> > 2048 , 0 , , 1024 25.7 , 23.02 , New , 2.68
> > 256 , 6 , , 64 5.12 , 4.89 , New , 0.23
> > 2048 , 0 , , 2048 42.34 , 37.71 , New , 4.63
> > 256 , 7 , , 64 5.03 , 4.62 , New , 0.41
> > 192 , 1 , , 32 4.96 , 4.28 , New , 0.68
> > 256 , 1 , , 32 4.95 , 4.28 , New , 0.67
> > 512 , 1 , , 32 4.94 , 4.29 , New , 0.65
> > 192 , 2 , , 64 5.1 , 4.8 , New , 0.3
> > 512 , 2 , , 64 5.12 , 4.72 , New , 0.4
> > 192 , 3 , , 96 5.54 , 5.12 , New , 0.42
> > 256 , 3 , , 96 5.52 , 5.15 , New , 0.37
> > 512 , 3 , , 96 5.51 , 5.16 , New , 0.35
> > 192 , 4 , , 128 6.1 , 5.53 , New , 0.57
> > 256 , 4 , , 128 6.09 , 5.49 , New , 0.6
> > 512 , 4 , , 128 6.08 , 5.48 , New , 0.6
> > 192 , 5 , , 160 7.42 , 6.71 , New , 0.71
> > 256 , 5 , , 160 6.86 , 6.71 , New , 0.15
> > 512 , 5 , , 160 9.28 , 8.68 , New , 0.6
> > 192 , 6 , , 192 7.94 , 7.47 , New , 0.47
> > 256 , 6 , , 192 7.62 , 7.17 , New , 0.45
> > 512 , 6 , , 192 9.2 , 9.16 , New , 0.04
> > 192 , 7 , , 224 8.02 , 7.43 , New , 0.59
> > 256 , 7 , , 224 8.34 , 7.85 , New , 0.49
> > 512 , 7 , , 224 9.89 , 9.16 , New , 0.73
> > 2 , 0 , , 1 3.0 , 3.0 , Eq , 0.0
> > 2 , 1 , , 1 3.0 , 3.0 , Eq , 0.0
> > 0 , 0 , , 1 3.01 , 3.6 , Cur , 0.59
> > 0 , 1 , , 1 3.01 , 3.6 , Cur , 0.59
> > 3 , 0 , , 2 3.0 , 3.0 , Eq , 0.0
> > 3 , 2 , , 2 3.0 , 3.0 , Eq , 0.0
> > 1 , 0 , , 2 3.6 , 3.0 , New , 0.6
> > 1 , 2 , , 2 3.6 , 3.0 , New , 0.6
> > 4 , 0 , , 3 3.01 , 3.01 , Eq , 0.0
> > 4 , 3 , , 3 3.01 , 3.01 , Eq , 0.0
> > 2 , 0 , , 3 3.62 , 3.02 , New , 0.6
> > 2 , 3 , , 3 3.62 , 3.03 , New , 0.59
> > 5 , 0 , , 4 3.02 , 3.03 , Cur , 0.01
> > 5 , 4 , , 4 3.02 , 3.02 , Eq , 0.0
> > 3 , 0 , , 4 3.63 , 3.02 , New , 0.61
> > 3 , 4 , , 4 3.63 , 3.04 , New , 0.59
> > 6 , 0 , , 5 3.05 , 3.04 , New , 0.01
> > 6 , 5 , , 5 3.02 , 3.02 , Eq , 0.0
> > 4 , 0 , , 5 3.63 , 3.02 , New , 0.61
> > 4 , 5 , , 5 3.64 , 3.03 , New , 0.61
> > 7 , 0 , , 6 3.03 , 3.03 , Eq , 0.0
> > 7 , 6 , , 6 3.02 , 3.02 , Eq , 0.0
> > 5 , 0 , , 6 3.64 , 3.01 , New , 0.63
> > 5 , 6 , , 6 3.64 , 3.03 , New , 0.61
> > 8 , 0 , , 7 3.03 , 3.04 , Cur , 0.01
> > 8 , 7 , , 7 3.04 , 3.04 , Eq , 0.0
> > 6 , 0 , , 7 3.67 , 3.04 , New , 0.63
> > 6 , 7 , , 7 3.65 , 3.05 , New , 0.6
> > 9 , 0 , , 8 3.05 , 3.05 , Eq , 0.0
> > 7 , 0 , , 8 3.67 , 3.05 , New , 0.62
> > 10 , 0 , , 9 3.06 , 3.06 , Eq , 0.0
> > 10 , 1 , , 9 3.06 , 3.06 , Eq , 0.0
> > 8 , 0 , , 9 3.67 , 3.06 , New , 0.61
> > 8 , 1 , , 9 3.67 , 3.06 , New , 0.61
> > 11 , 0 , , 10 3.06 , 3.06 , Eq , 0.0
> > 11 , 2 , , 10 3.07 , 3.06 , New , 0.01
> > 9 , 0 , , 10 3.67 , 3.05 , New , 0.62
> > 9 , 2 , , 10 3.67 , 3.06 , New , 0.61
> > 12 , 0 , , 11 3.06 , 3.06 , Eq , 0.0
> > 12 , 3 , , 11 3.06 , 3.06 , Eq , 0.0
> > 10 , 0 , , 11 3.67 , 3.06 , New , 0.61
> > 10 , 3 , , 11 3.67 , 3.06 , New , 0.61
> > 13 , 0 , , 12 3.06 , 3.07 , Cur , 0.01
> > 13 , 4 , , 12 3.06 , 3.07 , Cur , 0.01
> > 11 , 0 , , 12 3.67 , 3.11 , New , 0.56
> > 11 , 4 , , 12 3.68 , 3.12 , New , 0.56
> > 14 , 0 , , 13 3.07 , 3.1 , Cur , 0.03
> > 14 , 5 , , 13 3.06 , 3.07 , Cur , 0.01
> > 12 , 0 , , 13 3.67 , 3.07 , New , 0.6
> > 12 , 5 , , 13 3.67 , 3.08 , New , 0.59
> > 15 , 0 , , 14 3.06 , 3.06 , Eq , 0.0
> > 15 , 6 , , 14 3.07 , 3.06 , New , 0.01
> > 13 , 0 , , 14 3.67 , 3.06 , New , 0.61
> > 13 , 6 , , 14 3.68 , 3.06 , New , 0.62
> > 16 , 0 , , 15 3.06 , 3.06 , Eq , 0.0
> > 16 , 7 , , 15 3.06 , 3.05 , New , 0.01
> > 14 , 0 , , 15 3.68 , 3.06 , New , 0.62
> > 14 , 7 , , 15 3.67 , 3.06 , New , 0.61
> > 17 , 0 , , 16 3.07 , 3.06 , New , 0.01
> > 15 , 0 , , 16 3.68 , 3.06 , New , 0.62
> > 18 , 0 , , 17 3.06 , 3.06 , Eq , 0.0
> > 18 , 1 , , 17 3.06 , 3.06 , Eq , 0.0
> > 16 , 0 , , 17 3.67 , 3.06 , New , 0.61
> > 16 , 1 , , 17 3.67 , 3.05 , New , 0.62
> > 19 , 0 , , 18 3.07 , 3.06 , New , 0.01
> > 19 , 2 , , 18 3.06 , 3.06 , Eq , 0.0
> > 17 , 0 , , 18 3.68 , 3.08 , New , 0.6
> > 17 , 2 , , 18 3.68 , 3.06 , New , 0.62
> > 20 , 0 , , 19 3.06 , 3.06 , Eq , 0.0
> > 20 , 3 , , 19 3.06 , 3.06 , Eq , 0.0
> > 18 , 0 , , 19 3.68 , 3.06 , New , 0.62
> > 18 , 3 , , 19 3.68 , 3.06 , New , 0.62
> > 21 , 0 , , 20 3.06 , 3.06 , Eq , 0.0
> > 21 , 4 , , 20 3.06 , 3.06 , Eq , 0.0
> > 19 , 0 , , 20 3.67 , 3.06 , New , 0.61
> > 19 , 4 , , 20 3.67 , 3.06 , New , 0.61
> > 22 , 0 , , 21 3.06 , 3.06 , Eq , 0.0
> > 22 , 5 , , 21 3.06 , 3.06 , Eq , 0.0
> > 20 , 0 , , 21 3.67 , 3.05 , New , 0.62
> > 20 , 5 , , 21 3.68 , 3.06 , New , 0.62
> > 23 , 0 , , 22 3.07 , 3.06 , New , 0.01
> > 23 , 6 , , 22 3.06 , 3.06 , Eq , 0.0
> > 21 , 0 , , 22 3.68 , 3.07 , New , 0.61
> > 21 , 6 , , 22 3.67 , 3.06 , New , 0.61
> > 24 , 0 , , 23 3.19 , 3.06 , New , 0.13
> > 24 , 7 , , 23 3.08 , 3.06 , New , 0.02
> > 22 , 0 , , 23 3.69 , 3.06 , New , 0.63
> > 22 , 7 , , 23 3.68 , 3.06 , New , 0.62
> > 25 , 0 , , 24 3.07 , 3.06 , New , 0.01
> > 23 , 0 , , 24 3.68 , 3.06 , New , 0.62
> > 26 , 0 , , 25 3.06 , 3.05 , New , 0.01
> > 26 , 1 , , 25 3.07 , 3.06 , New , 0.01
> > 24 , 0 , , 25 3.67 , 3.05 , New , 0.62
> > 24 , 1 , , 25 3.68 , 3.06 , New , 0.62
> > 27 , 0 , , 26 3.12 , 3.06 , New , 0.06
> > 27 , 2 , , 26 3.08 , 3.06 , New , 0.02
> > 25 , 0 , , 26 3.69 , 3.06 , New , 0.63
> > 25 , 2 , , 26 3.67 , 3.06 , New , 0.61
> > 28 , 0 , , 27 3.06 , 3.06 , Eq , 0.0
> > 28 , 3 , , 27 3.06 , 3.06 , Eq , 0.0
> > 26 , 0 , , 27 3.67 , 3.06 , New , 0.61
> > 26 , 3 , , 27 3.67 , 3.06 , New , 0.61
> > 29 , 0 , , 28 3.06 , 3.06 , Eq , 0.0
> > 29 , 4 , , 28 3.06 , 3.06 , Eq , 0.0
> > 27 , 0 , , 28 3.68 , 3.05 , New , 0.63
> > 27 , 4 , , 28 3.67 , 3.06 , New , 0.61
> > 30 , 0 , , 29 3.06 , 3.06 , Eq , 0.0
> > 30 , 5 , , 29 3.06 , 3.06 , Eq , 0.0
> > 28 , 0 , , 29 3.67 , 3.06 , New , 0.61
> > 28 , 5 , , 29 3.68 , 3.06 , New , 0.62
> > 31 , 0 , , 30 3.06 , 3.06 , Eq , 0.0
> > 31 , 6 , , 30 3.06 , 3.06 , Eq , 0.0
> > 29 , 0 , , 30 3.68 , 3.06 , New , 0.62
> > 29 , 6 , , 30 3.7 , 3.06 , New , 0.64
> > 32 , 0 , , 31 3.17 , 3.06 , New , 0.11
> > 32 , 7 , , 31 3.12 , 3.06 , New , 0.06
> > 30 , 0 , , 31 3.68 , 3.06 , New , 0.62
> > 30 , 7 , , 31 3.68 , 3.06 , New , 0.62
> >
> > Results For Icelake memchr-evex
> > size , algn , Pos , Cur T , New T , Win , Dif
> > 2048 , 0 , , 32 4.94 , 4.26 , New , 0.68
> > 256 , 1 , , 64 4.5 , 4.13 , New , 0.37
> > 2048 , 0 , , 64 4.19 , 3.9 , New , 0.29
> > 256 , 2 , , 64 4.19 , 3.87 , New , 0.32
> > 2048 , 0 , , 128 4.96 , 4.53 , New , 0.43
> > 256 , 3 , , 64 4.07 , 3.86 , New , 0.21
> > 2048 , 0 , , 256 8.77 , 8.61 , New , 0.16
> > 256 , 4 , , 64 4.08 , 3.87 , New , 0.21
> > 2048 , 0 , , 512 12.22 , 11.67 , New , 0.55
> > 256 , 5 , , 64 4.12 , 3.83 , New , 0.29
> > 2048 , 0 , , 1024 20.06 , 18.09 , New , 1.97
> > 256 , 6 , , 64 4.2 , 3.95 , New , 0.25
> > 2048 , 0 , , 2048 33.83 , 30.62 , New , 3.21
> > 256 , 7 , , 64 4.3 , 4.04 , New , 0.26
> > 192 , 1 , , 32 4.2 , 3.71 , New , 0.49
> > 256 , 1 , , 32 4.24 , 3.76 , New , 0.48
> > 512 , 1 , , 32 4.29 , 3.74 , New , 0.55
> > 192 , 2 , , 64 4.42 , 4.0 , New , 0.42
> > 512 , 2 , , 64 4.17 , 3.83 , New , 0.34
> > 192 , 3 , , 96 4.44 , 4.26 , New , 0.18
> > 256 , 3 , , 96 4.45 , 4.14 , New , 0.31
> > 512 , 3 , , 96 4.42 , 4.15 , New , 0.27
> > 192 , 4 , , 128 4.93 , 4.45 , New , 0.48
> > 256 , 4 , , 128 4.93 , 4.47 , New , 0.46
> > 512 , 4 , , 128 4.95 , 4.47 , New , 0.48
> > 192 , 5 , , 160 5.95 , 5.44 , New , 0.51
> > 256 , 5 , , 160 5.59 , 5.47 , New , 0.12
> > 512 , 5 , , 160 7.59 , 7.34 , New , 0.25
> > 192 , 6 , , 192 6.53 , 6.08 , New , 0.45
> > 256 , 6 , , 192 6.2 , 5.88 , New , 0.32
> > 512 , 6 , , 192 7.53 , 7.62 , Cur , 0.09
> > 192 , 7 , , 224 6.62 , 6.12 , New , 0.5
> > 256 , 7 , , 224 6.79 , 6.51 , New , 0.28
> > 512 , 7 , , 224 8.12 , 7.61 , New , 0.51
> > 2 , 0 , , 1 2.5 , 2.54 , Cur , 0.04
> > 2 , 1 , , 1 2.56 , 2.55 , New , 0.01
> > 0 , 0 , , 1 2.57 , 3.12 , Cur , 0.55
> > 0 , 1 , , 1 2.59 , 3.14 , Cur , 0.55
> > 3 , 0 , , 2 2.62 , 2.63 , Cur , 0.01
> > 3 , 2 , , 2 2.66 , 2.67 , Cur , 0.01
> > 1 , 0 , , 2 3.24 , 2.72 , New , 0.52
> > 1 , 2 , , 2 3.28 , 2.75 , New , 0.53
> > 4 , 0 , , 3 2.78 , 2.8 , Cur , 0.02
> > 4 , 3 , , 3 2.8 , 2.82 , Cur , 0.02
> > 2 , 0 , , 3 3.38 , 2.86 , New , 0.52
> > 2 , 3 , , 3 3.41 , 2.89 , New , 0.52
> > 5 , 0 , , 4 2.88 , 2.91 , Cur , 0.03
> > 5 , 4 , , 4 2.88 , 2.92 , Cur , 0.04
> > 3 , 0 , , 4 3.48 , 2.93 , New , 0.55
> > 3 , 4 , , 4 3.47 , 2.93 , New , 0.54
> > 6 , 0 , , 5 2.95 , 2.94 , New , 0.01
> > 6 , 5 , , 5 2.91 , 2.92 , Cur , 0.01
> > 4 , 0 , , 5 3.47 , 2.9 , New , 0.57
> > 4 , 5 , , 5 3.43 , 2.91 , New , 0.52
> > 7 , 0 , , 6 2.87 , 2.9 , Cur , 0.03
> > 7 , 6 , , 6 2.87 , 2.89 , Cur , 0.02
> > 5 , 0 , , 6 3.44 , 2.88 , New , 0.56
> > 5 , 6 , , 6 3.41 , 2.87 , New , 0.54
> > 8 , 0 , , 7 2.86 , 2.87 , Cur , 0.01
> > 8 , 7 , , 7 2.86 , 2.87 , Cur , 0.01
> > 6 , 0 , , 7 3.43 , 2.87 , New , 0.56
> > 6 , 7 , , 7 3.44 , 2.87 , New , 0.57
> > 9 , 0 , , 8 2.86 , 2.88 , Cur , 0.02
> > 7 , 0 , , 8 3.41 , 2.89 , New , 0.52
> > 10 , 0 , , 9 2.83 , 2.87 , Cur , 0.04
> > 10 , 1 , , 9 2.82 , 2.87 , Cur , 0.05
> > 8 , 0 , , 9 3.4 , 2.89 , New , 0.51
> > 8 , 1 , , 9 3.41 , 2.87 , New , 0.54
> > 11 , 0 , , 10 2.83 , 2.88 , Cur , 0.05
> > 11 , 2 , , 10 2.84 , 2.88 , Cur , 0.04
> > 9 , 0 , , 10 3.41 , 2.87 , New , 0.54
> > 9 , 2 , , 10 3.41 , 2.88 , New , 0.53
> > 12 , 0 , , 11 2.83 , 2.89 , Cur , 0.06
> > 12 , 3 , , 11 2.85 , 2.87 , Cur , 0.02
> > 10 , 0 , , 11 3.41 , 2.87 , New , 0.54
> > 10 , 3 , , 11 3.42 , 2.88 , New , 0.54
> > 13 , 0 , , 12 2.86 , 2.87 , Cur , 0.01
> > 13 , 4 , , 12 2.84 , 2.88 , Cur , 0.04
> > 11 , 0 , , 12 3.43 , 2.87 , New , 0.56
> > 11 , 4 , , 12 3.49 , 2.87 , New , 0.62
> > 14 , 0 , , 13 2.85 , 2.86 , Cur , 0.01
> > 14 , 5 , , 13 2.85 , 2.86 , Cur , 0.01
> > 12 , 0 , , 13 3.41 , 2.86 , New , 0.55
> > 12 , 5 , , 13 3.44 , 2.85 , New , 0.59
> > 15 , 0 , , 14 2.83 , 2.87 , Cur , 0.04
> > 15 , 6 , , 14 2.82 , 2.86 , Cur , 0.04
> > 13 , 0 , , 14 3.41 , 2.86 , New , 0.55
> > 13 , 6 , , 14 3.4 , 2.86 , New , 0.54
> > 16 , 0 , , 15 2.84 , 2.86 , Cur , 0.02
> > 16 , 7 , , 15 2.83 , 2.85 , Cur , 0.02
> > 14 , 0 , , 15 3.41 , 2.85 , New , 0.56
> > 14 , 7 , , 15 3.39 , 2.87 , New , 0.52
> > 17 , 0 , , 16 2.83 , 2.87 , Cur , 0.04
> > 15 , 0 , , 16 3.4 , 2.85 , New , 0.55
> > 18 , 0 , , 17 2.83 , 2.86 , Cur , 0.03
> > 18 , 1 , , 17 2.85 , 2.84 , New , 0.01
> > 16 , 0 , , 17 3.41 , 2.85 , New , 0.56
> > 16 , 1 , , 17 3.4 , 2.86 , New , 0.54
> > 19 , 0 , , 18 2.8 , 2.84 , Cur , 0.04
> > 19 , 2 , , 18 2.82 , 2.83 , Cur , 0.01
> > 17 , 0 , , 18 3.39 , 2.86 , New , 0.53
> > 17 , 2 , , 18 3.39 , 2.84 , New , 0.55
> > 20 , 0 , , 19 2.85 , 2.87 , Cur , 0.02
> > 20 , 3 , , 19 2.88 , 2.87 , New , 0.01
> > 18 , 0 , , 19 3.38 , 2.85 , New , 0.53
> > 18 , 3 , , 19 3.4 , 2.85 , New , 0.55
> > 21 , 0 , , 20 2.83 , 2.85 , Cur , 0.02
> > 21 , 4 , , 20 2.88 , 2.85 , New , 0.03
> > 19 , 0 , , 20 3.39 , 2.84 , New , 0.55
> > 19 , 4 , , 20 3.39 , 2.96 , New , 0.43
> > 22 , 0 , , 21 2.84 , 2.9 , Cur , 0.06
> > 22 , 5 , , 21 2.81 , 2.84 , Cur , 0.03
> > 20 , 0 , , 21 3.41 , 2.81 , New , 0.6
> > 20 , 5 , , 21 3.38 , 2.83 , New , 0.55
> > 23 , 0 , , 22 2.8 , 2.82 , Cur , 0.02
> > 23 , 6 , , 22 2.81 , 2.83 , Cur , 0.02
> > 21 , 0 , , 22 3.35 , 2.81 , New , 0.54
> > 21 , 6 , , 22 3.34 , 2.81 , New , 0.53
> > 24 , 0 , , 23 2.77 , 2.84 , Cur , 0.07
> > 24 , 7 , , 23 2.78 , 2.8 , Cur , 0.02
> > 22 , 0 , , 23 3.34 , 2.79 , New , 0.55
> > 22 , 7 , , 23 3.32 , 2.79 , New , 0.53
> > 25 , 0 , , 24 2.77 , 2.8 , Cur , 0.03
> > 23 , 0 , , 24 3.29 , 2.79 , New , 0.5
> > 26 , 0 , , 25 2.73 , 2.78 , Cur , 0.05
> > 26 , 1 , , 25 2.75 , 2.79 , Cur , 0.04
> > 24 , 0 , , 25 3.27 , 2.79 , New , 0.48
> > 24 , 1 , , 25 3.27 , 2.77 , New , 0.5
> > 27 , 0 , , 26 2.72 , 2.78 , Cur , 0.06
> > 27 , 2 , , 26 2.75 , 2.76 , Cur , 0.01
> > 25 , 0 , , 26 3.29 , 2.73 , New , 0.56
> > 25 , 2 , , 26 3.3 , 2.76 , New , 0.54
> > 28 , 0 , , 27 2.75 , 2.79 , Cur , 0.04
> > 28 , 3 , , 27 2.77 , 2.77 , Eq , 0.0
> > 26 , 0 , , 27 3.28 , 2.78 , New , 0.5
> > 26 , 3 , , 27 3.29 , 2.78 , New , 0.51
> > 29 , 0 , , 28 2.74 , 2.76 , Cur , 0.02
> > 29 , 4 , , 28 2.74 , 2.77 , Cur , 0.03
> > 27 , 0 , , 28 3.3 , 2.76 , New , 0.54
> > 27 , 4 , , 28 3.3 , 2.74 , New , 0.56
> > 30 , 0 , , 29 2.72 , 2.76 , Cur , 0.04
> > 30 , 5 , , 29 2.74 , 2.75 , Cur , 0.01
> > 28 , 0 , , 29 3.25 , 2.73 , New , 0.52
> > 28 , 5 , , 29 3.3 , 2.73 , New , 0.57
> > 31 , 0 , , 30 2.73 , 2.77 , Cur , 0.04
> > 31 , 6 , , 30 2.74 , 2.76 , Cur , 0.02
> > 29 , 0 , , 30 3.25 , 2.73 , New , 0.52
> > 29 , 6 , , 30 3.26 , 2.74 , New , 0.52
> > 32 , 0 , , 31 2.73 , 2.74 , Cur , 0.01
> > 32 , 7 , , 31 2.73 , 2.75 , Cur , 0.02
> > 30 , 0 , , 31 3.24 , 2.72 , New , 0.52
> > 30 , 7 , , 31 3.24 , 2.72 , New , 0.52
> >
> > For memchr-avx2 the improvements are more modest though again near
> > universal. The improvement is most significant for medium sizes and
> > small sizes with pos > size. For small sizes with pos < size and for
> > large sizes the two implementations perform roughly the same.
> >
> > Results For Tigerlake memchr-avx2
> > size , algn , Pos , Cur T , New T , Win , Dif
> > 2048 , 0    , 32    , 6.15  , 6.27  , Cur , 0.12
> > 256  , 1    , 64    , 6.21  , 6.03  , New , 0.18
> > 2048 , 0    , 64    , 6.07  , 5.95  , New , 0.12
> > 256  , 2    , 64    , 6.01  , 5.8   , New , 0.21
> > 2048 , 0    , 128   , 7.05  , 6.55  , New , 0.5
> > 256  , 3    , 64    , 6.14  , 5.83  , New , 0.31
> > 2048 , 0    , 256   , 11.78 , 11.78 , Eq  , 0.0
> > 256  , 4    , 64    , 6.1   , 5.85  , New , 0.25
> > 2048 , 0    , 512   , 16.32 , 15.96 , New , 0.36
> > 256  , 5    , 64    , 6.1   , 5.77  , New , 0.33
> > 2048 , 0    , 1024  , 25.38 , 25.18 , New , 0.2
> > 256  , 6    , 64    , 6.08  , 5.88  , New , 0.2
> > 2048 , 0    , 2048  , 38.56 , 38.32 , New , 0.24
> > 256  , 7    , 64    , 5.93  , 5.68  , New , 0.25
> > 192  , 1    , 32    , 5.49  , 5.3   , New , 0.19
> > 256  , 1    , 32    , 5.5   , 5.28  , New , 0.22
> > 512  , 1    , 32    , 5.48  , 5.32  , New , 0.16
> > 192  , 2    , 64    , 6.1   , 5.73  , New , 0.37
> > 512  , 2    , 64    , 5.88  , 5.72  , New , 0.16
> > 192  , 3    , 96    , 6.31  , 5.93  , New , 0.38
> > 256  , 3    , 96    , 6.32  , 5.93  , New , 0.39
> > 512  , 3    , 96    , 6.2   , 5.94  , New , 0.26
> > 192  , 4    , 128   , 6.65  , 6.4   , New , 0.25
> > 256  , 4    , 128   , 6.6   , 6.37  , New , 0.23
> > 512  , 4    , 128   , 6.74  , 6.33  , New , 0.41
> > 192  , 5    , 160   , 7.78  , 7.4   , New , 0.38
> > 256  , 5    , 160   , 7.18  , 7.4   , Cur , 0.22
> > 512  , 5    , 160   , 9.81  , 9.44  , New , 0.37
> > 192  , 6    , 192   , 9.12  , 7.77  , New , 1.35
> > 256  , 6    , 192   , 7.97  , 7.66  , New , 0.31
> > 512  , 6    , 192   , 10.14 , 9.95  , New , 0.19
> > 192  , 7    , 224   , 8.96  , 7.78  , New , 1.18
> > 256  , 7    , 224   , 8.52  , 8.23  , New , 0.29
> > 512  , 7    , 224   , 10.33 , 9.98  , New , 0.35
> > 2    , 0    , 1     , 3.61  , 3.6   , New , 0.01
> > 2    , 1    , 1     , 3.6   , 3.6   , Eq  , 0.0
> > 0    , 0    , 1     , 3.02  , 3.0   , New , 0.02
> > 0    , 1    , 1     , 3.0   , 3.0   , Eq  , 0.0
> > 3    , 0    , 2     , 3.6   , 3.6   , Eq  , 0.0
> > 3    , 2    , 2     , 3.61  , 3.6   , New , 0.01
> > 1    , 0    , 2     , 4.82  , 3.6   , New , 1.22
> > 1    , 2    , 2     , 4.81  , 3.6   , New , 1.21
> > 4    , 0    , 3     , 3.61  , 3.61  , Eq  , 0.0
> > 4    , 3    , 3     , 3.62  , 3.61  , New , 0.01
> > 2    , 0    , 3     , 4.82  , 3.62  , New , 1.2
> > 2    , 3    , 3     , 4.83  , 3.63  , New , 1.2
> > 5    , 0    , 4     , 3.63  , 3.64  , Cur , 0.01
> > 5    , 4    , 4     , 3.63  , 3.62  , New , 0.01
> > 3    , 0    , 4     , 4.84  , 3.62  , New , 1.22
> > 3    , 4    , 4     , 4.84  , 3.64  , New , 1.2
> > 6    , 0    , 5     , 3.66  , 3.64  , New , 0.02
> > 6    , 5    , 5     , 3.65  , 3.62  , New , 0.03
> > 4    , 0    , 5     , 4.83  , 3.63  , New , 1.2
> > 4    , 5    , 5     , 4.85  , 3.64  , New , 1.21
> > 7    , 0    , 6     , 3.76  , 3.79  , Cur , 0.03
> > 7    , 6    , 6     , 3.76  , 3.72  , New , 0.04
> > 5    , 0    , 6     , 4.84  , 3.62  , New , 1.22
> > 5    , 6    , 6     , 4.85  , 3.64  , New , 1.21
> > 8    , 0    , 7     , 3.64  , 3.65  , Cur , 0.01
> > 8    , 7    , 7     , 3.65  , 3.65  , Eq  , 0.0
> > 6    , 0    , 7     , 4.88  , 3.64  , New , 1.24
> > 6    , 7    , 7     , 4.87  , 3.65  , New , 1.22
> > 9    , 0    , 8     , 3.66  , 3.66  , Eq  , 0.0
> > 7    , 0    , 8     , 4.89  , 3.66  , New , 1.23
> > 10   , 0    , 9     , 3.67  , 3.67  , Eq  , 0.0
> > 10   , 1    , 9     , 3.67  , 3.67  , Eq  , 0.0
> > 8    , 0    , 9     , 4.9   , 3.67  , New , 1.23
> > 8    , 1    , 9     , 4.9   , 3.67  , New , 1.23
> > 11   , 0    , 10    , 3.68  , 3.67  , New , 0.01
> > 11   , 2    , 10    , 3.69  , 3.67  , New , 0.02
> > 9    , 0    , 10    , 4.9   , 3.67  , New , 1.23
> > 9    , 2    , 10    , 4.9   , 3.67  , New , 1.23
> > 12   , 0    , 11    , 3.71  , 3.68  , New , 0.03
> > 12   , 3    , 11    , 3.71  , 3.67  , New , 0.04
> > 10   , 0    , 11    , 4.9   , 3.67  , New , 1.23
> > 10   , 3    , 11    , 4.9   , 3.67  , New , 1.23
> > 13   , 0    , 12    , 4.24  , 4.23  , New , 0.01
> > 13   , 4    , 12    , 4.23  , 4.23  , Eq  , 0.0
> > 11   , 0    , 12    , 4.9   , 3.7   , New , 1.2
> > 11   , 4    , 12    , 4.9   , 3.73  , New , 1.17
> > 14   , 0    , 13    , 3.99  , 4.01  , Cur , 0.02
> > 14   , 5    , 13    , 3.98  , 3.98  , Eq  , 0.0
> > 12   , 0    , 13    , 4.9   , 3.69  , New , 1.21
> > 12   , 5    , 13    , 4.9   , 3.69  , New , 1.21
> > 15   , 0    , 14    , 3.99  , 3.97  , New , 0.02
> > 15   , 6    , 14    , 4.0   , 4.0   , Eq  , 0.0
> > 13   , 0    , 14    , 4.9   , 3.67  , New , 1.23
> > 13   , 6    , 14    , 4.9   , 3.67  , New , 1.23
> > 16   , 0    , 15    , 3.99  , 4.02  , Cur , 0.03
> > 16   , 7    , 15    , 4.01  , 3.96  , New , 0.05
> > 14   , 0    , 15    , 4.93  , 3.67  , New , 1.26
> > 14   , 7    , 15    , 4.92  , 3.67  , New , 1.25
> > 17   , 0    , 16    , 4.04  , 3.99  , New , 0.05
> > 15   , 0    , 16    , 5.42  , 4.22  , New , 1.2
> > 18   , 0    , 17    , 4.01  , 3.97  , New , 0.04
> > 18   , 1    , 17    , 3.99  , 3.98  , New , 0.01
> > 16   , 0    , 17    , 5.22  , 3.98  , New , 1.24
> > 16   , 1    , 17    , 5.19  , 3.98  , New , 1.21
> > 19   , 0    , 18    , 4.0   , 3.99  , New , 0.01
> > 19   , 2    , 18    , 4.03  , 3.97  , New , 0.06
> > 17   , 0    , 18    , 5.18  , 3.99  , New , 1.19
> > 17   , 2    , 18    , 5.18  , 3.98  , New , 1.2
> > 20   , 0    , 19    , 4.02  , 3.98  , New , 0.04
> > 20   , 3    , 19    , 4.0   , 3.98  , New , 0.02
> > 18   , 0    , 19    , 5.19  , 3.97  , New , 1.22
> > 18   , 3    , 19    , 5.21  , 3.98  , New , 1.23
> > 21   , 0    , 20    , 3.98  , 4.0   , Cur , 0.02
> > 21   , 4    , 20    , 4.0   , 4.0   , Eq  , 0.0
> > 19   , 0    , 20    , 5.19  , 3.99  , New , 1.2
> > 19   , 4    , 20    , 5.17  , 3.99  , New , 1.18
> > 22   , 0    , 21    , 4.03  , 3.98  , New , 0.05
> > 22   , 5    , 21    , 4.01  , 3.95  , New , 0.06
> > 20   , 0    , 21    , 5.19  , 4.0   , New , 1.19
> > 20   , 5    , 21    , 5.21  , 3.99  , New , 1.22
> > 23   , 0    , 22    , 4.06  , 3.97  , New , 0.09
> > 23   , 6    , 22    , 4.02  , 3.98  , New , 0.04
> > 21   , 0    , 22    , 5.2   , 4.02  , New , 1.18
> > 21   , 6    , 22    , 5.22  , 4.0   , New , 1.22
> > 24   , 0    , 23    , 4.15  , 3.98  , New , 0.17
> > 24   , 7    , 23    , 4.0   , 4.01  , Cur , 0.01
> > 22   , 0    , 23    , 5.28  , 4.0   , New , 1.28
> > 22   , 7    , 23    , 5.22  , 3.99  , New , 1.23
> > 25   , 0    , 24    , 4.1   , 4.04  , New , 0.06
> > 23   , 0    , 24    , 5.23  , 4.04  , New , 1.19
> > 26   , 0    , 25    , 4.1   , 4.06  , New , 0.04
> > 26   , 1    , 25    , 4.07  , 3.99  , New , 0.08
> > 24   , 0    , 25    , 5.26  , 4.02  , New , 1.24
> > 24   , 1    , 25    , 5.21  , 4.0   , New , 1.21
> > 27   , 0    , 26    , 4.17  , 4.03  , New , 0.14
> > 27   , 2    , 26    , 4.09  , 4.03  , New , 0.06
> > 25   , 0    , 26    , 5.29  , 4.1   , New , 1.19
> > 25   , 2    , 26    , 5.25  , 4.0   , New , 1.25
> > 28   , 0    , 27    , 4.06  , 4.1   , Cur , 0.04
> > 28   , 3    , 27    , 4.09  , 4.04  , New , 0.05
> > 26   , 0    , 27    , 5.26  , 4.04  , New , 1.22
> > 26   , 3    , 27    , 5.28  , 4.01  , New , 1.27
> > 29   , 0    , 28    , 4.07  , 4.02  , New , 0.05
> > 29   , 4    , 28    , 4.07  , 4.05  , New , 0.02
> > 27   , 0    , 28    , 5.25  , 4.02  , New , 1.23
> > 27   , 4    , 28    , 5.25  , 4.03  , New , 1.22
> > 30   , 0    , 29    , 4.14  , 4.06  , New , 0.08
> > 30   , 5    , 29    , 4.08  , 4.04  , New , 0.04
> > 28   , 0    , 29    , 5.26  , 4.07  , New , 1.19
> > 28   , 5    , 29    , 5.28  , 4.04  , New , 1.24
> > 31   , 0    , 30    , 4.09  , 4.08  , New , 0.01
> > 31   , 6    , 30    , 4.1   , 4.08  , New , 0.02
> > 29   , 0    , 30    , 5.28  , 4.05  , New , 1.23
> > 29   , 6    , 30    , 5.24  , 4.07  , New , 1.17
> > 32   , 0    , 31    , 4.1   , 4.13  , Cur , 0.03
> > 32   , 7    , 31    , 4.16  , 4.09  , New , 0.07
> > 30   , 0    , 31    , 5.31  , 4.09  , New , 1.22
> > 30   , 7    , 31    , 5.28  , 4.08  , New , 1.2
> >
> > Results For Icelake memchr-avx2
> > size , algn , Pos , Cur T , New T , Win , Dif
> > 2048 , 0    , 32    , 5.74  , 5.08  , New , 0.66
> > 256  , 1    , 64    , 5.16  , 4.93  , New , 0.23
> > 2048 , 0    , 64    , 4.86  , 4.69  , New , 0.17
> > 256  , 2    , 64    , 4.78  , 4.7   , New , 0.08
> > 2048 , 0    , 128   , 5.64  , 5.0   , New , 0.64
> > 256  , 3    , 64    , 4.64  , 4.59  , New , 0.05
> > 2048 , 0    , 256   , 9.07  , 9.17  , Cur , 0.1
> > 256  , 4    , 64    , 4.7   , 4.6   , New , 0.1
> > 2048 , 0    , 512   , 12.56 , 12.33 , New , 0.23
> > 256  , 5    , 64    , 4.72  , 4.61  , New , 0.11
> > 2048 , 0    , 1024  , 19.36 , 19.49 , Cur , 0.13
> > 256  , 6    , 64    , 4.82  , 4.69  , New , 0.13
> > 2048 , 0    , 2048  , 29.99 , 30.53 , Cur , 0.54
> > 256  , 7    , 64    , 4.9   , 4.85  , New , 0.05
> > 192  , 1    , 32    , 4.89  , 4.45  , New , 0.44
> > 256  , 1    , 32    , 4.93  , 4.44  , New , 0.49
> > 512  , 1    , 32    , 4.97  , 4.45  , New , 0.52
> > 192  , 2    , 64    , 5.04  , 4.65  , New , 0.39
> > 512  , 2    , 64    , 4.75  , 4.66  , New , 0.09
> > 192  , 3    , 96    , 5.14  , 4.66  , New , 0.48
> > 256  , 3    , 96    , 5.12  , 4.66  , New , 0.46
> > 512  , 3    , 96    , 5.13  , 4.62  , New , 0.51
> > 192  , 4    , 128   , 5.65  , 4.95  , New , 0.7
> > 256  , 4    , 128   , 5.63  , 4.95  , New , 0.68
> > 512  , 4    , 128   , 5.68  , 4.96  , New , 0.72
> > 192  , 5    , 160   , 6.1   , 5.84  , New , 0.26
> > 256  , 5    , 160   , 5.58  , 5.84  , Cur , 0.26
> > 512  , 5    , 160   , 7.95  , 7.74  , New , 0.21
> > 192  , 6    , 192   , 7.07  , 6.23  , New , 0.84
> > 256  , 6    , 192   , 6.34  , 6.09  , New , 0.25
> > 512  , 6    , 192   , 8.17  , 8.13  , New , 0.04
> > 192  , 7    , 224   , 7.06  , 6.23  , New , 0.83
> > 256  , 7    , 224   , 6.76  , 6.65  , New , 0.11
> > 512  , 7    , 224   , 8.29  , 8.08  , New , 0.21
> > 2    , 0    , 1     , 3.0   , 3.04  , Cur , 0.04
> > 2    , 1    , 1     , 3.06  , 3.07  , Cur , 0.01
> > 0    , 0    , 1     , 2.57  , 2.59  , Cur , 0.02
> > 0    , 1    , 1     , 2.6   , 2.61  , Cur , 0.01
> > 3    , 0    , 2     , 3.15  , 3.17  , Cur , 0.02
> > 3    , 2    , 2     , 3.19  , 3.21  , Cur , 0.02
> > 1    , 0    , 2     , 4.32  , 3.25  , New , 1.07
> > 1    , 2    , 2     , 4.36  , 3.31  , New , 1.05
> > 4    , 0    , 3     , 3.5   , 3.52  , Cur , 0.02
> > 4    , 3    , 3     , 3.52  , 3.54  , Cur , 0.02
> > 2    , 0    , 3     , 4.51  , 3.43  , New , 1.08
> > 2    , 3    , 3     , 4.56  , 3.47  , New , 1.09
> > 5    , 0    , 4     , 3.61  , 3.65  , Cur , 0.04
> > 5    , 4    , 4     , 3.63  , 3.67  , Cur , 0.04
> > 3    , 0    , 4     , 4.64  , 3.51  , New , 1.13
> > 3    , 4    , 4     , 4.7   , 3.51  , New , 1.19
> > 6    , 0    , 5     , 3.66  , 3.68  , Cur , 0.02
> > 6    , 5    , 5     , 3.69  , 3.65  , New , 0.04
> > 4    , 0    , 5     , 4.7   , 3.49  , New , 1.21
> > 4    , 5    , 5     , 4.58  , 3.48  , New , 1.1
> > 7    , 0    , 6     , 3.6   , 3.65  , Cur , 0.05
> > 7    , 6    , 6     , 3.59  , 3.64  , Cur , 0.05
> > 5    , 0    , 6     , 4.74  , 3.65  , New , 1.09
> > 5    , 6    , 6     , 4.73  , 3.64  , New , 1.09
> > 8    , 0    , 7     , 3.6   , 3.61  , Cur , 0.01
> > 8    , 7    , 7     , 3.6   , 3.61  , Cur , 0.01
> > 6    , 0    , 7     , 4.73  , 3.6   , New , 1.13
> > 6    , 7    , 7     , 4.73  , 3.62  , New , 1.11
> > 9    , 0    , 8     , 3.59  , 3.62  , Cur , 0.03
> > 7    , 0    , 8     , 4.72  , 3.64  , New , 1.08
> > 10   , 0    , 9     , 3.57  , 3.62  , Cur , 0.05
> > 10   , 1    , 9     , 3.56  , 3.61  , Cur , 0.05
> > 8    , 0    , 9     , 4.69  , 3.63  , New , 1.06
> > 8    , 1    , 9     , 4.71  , 3.61  , New , 1.1
> > 11   , 0    , 10    , 3.58  , 3.62  , Cur , 0.04
> > 11   , 2    , 10    , 3.59  , 3.63  , Cur , 0.04
> > 9    , 0    , 10    , 4.72  , 3.61  , New , 1.11
> > 9    , 2    , 10    , 4.7   , 3.61  , New , 1.09
> > 12   , 0    , 11    , 3.58  , 3.63  , Cur , 0.05
> > 12   , 3    , 11    , 3.58  , 3.62  , Cur , 0.04
> > 10   , 0    , 11    , 4.7   , 3.6   , New , 1.1
> > 10   , 3    , 11    , 4.73  , 3.64  , New , 1.09
> > 13   , 0    , 12    , 3.6   , 3.6   , Eq  , 0.0
> > 13   , 4    , 12    , 3.57  , 3.62  , Cur , 0.05
> > 11   , 0    , 12    , 4.73  , 3.62  , New , 1.11
> > 11   , 4    , 12    , 4.79  , 3.61  , New , 1.18
> > 14   , 0    , 13    , 3.61  , 3.62  , Cur , 0.01
> > 14   , 5    , 13    , 3.59  , 3.59  , Eq  , 0.0
> > 12   , 0    , 13    , 4.7   , 3.61  , New , 1.09
> > 12   , 5    , 13    , 4.75  , 3.58  , New , 1.17
> > 15   , 0    , 14    , 3.58  , 3.62  , Cur , 0.04
> > 15   , 6    , 14    , 3.59  , 3.62  , Cur , 0.03
> > 13   , 0    , 14    , 4.68  , 3.6   , New , 1.08
> > 13   , 6    , 14    , 4.68  , 3.63  , New , 1.05
> > 16   , 0    , 15    , 3.57  , 3.6   , Cur , 0.03
> > 16   , 7    , 15    , 3.55  , 3.59  , Cur , 0.04
> > 14   , 0    , 15    , 4.69  , 3.61  , New , 1.08
> > 14   , 7    , 15    , 4.69  , 3.61  , New , 1.08
> > 17   , 0    , 16    , 3.56  , 3.61  , Cur , 0.05
> > 15   , 0    , 16    , 4.71  , 3.58  , New , 1.13
> > 18   , 0    , 17    , 3.57  , 3.65  , Cur , 0.08
> > 18   , 1    , 17    , 3.58  , 3.59  , Cur , 0.01
> > 16   , 0    , 17    , 4.7   , 3.58  , New , 1.12
> > 16   , 1    , 17    , 4.68  , 3.59  , New , 1.09
> > 19   , 0    , 18    , 3.51  , 3.58  , Cur , 0.07
> > 19   , 2    , 18    , 3.55  , 3.58  , Cur , 0.03
> > 17   , 0    , 18    , 4.69  , 3.61  , New , 1.08
> > 17   , 2    , 18    , 4.68  , 3.61  , New , 1.07
> > 20   , 0    , 19    , 3.57  , 3.6   , Cur , 0.03
> > 20   , 3    , 19    , 3.59  , 3.59  , Eq  , 0.0
> > 18   , 0    , 19    , 4.68  , 3.59  , New , 1.09
> > 18   , 3    , 19    , 4.67  , 3.57  , New , 1.1
> > 21   , 0    , 20    , 3.61  , 3.58  , New , 0.03
> > 21   , 4    , 20    , 3.62  , 3.6   , New , 0.02
> > 19   , 0    , 20    , 4.74  , 3.57  , New , 1.17
> > 19   , 4    , 20    , 4.69  , 3.7   , New , 0.99
> > 22   , 0    , 21    , 3.57  , 3.64  , Cur , 0.07
> > 22   , 5    , 21    , 3.55  , 3.6   , Cur , 0.05
> > 20   , 0    , 21    , 4.72  , 3.55  , New , 1.17
> > 20   , 5    , 21    , 4.66  , 3.55  , New , 1.11
> > 23   , 0    , 22    , 3.56  , 3.56  , Eq  , 0.0
> > 23   , 6    , 22    , 3.54  , 3.56  , Cur , 0.02
> > 21   , 0    , 22    , 4.65  , 3.53  , New , 1.12
> > 21   , 6    , 22    , 4.62  , 3.56  , New , 1.06
> > 24   , 0    , 23    , 3.5   , 3.54  , Cur , 0.04
> > 24   , 7    , 23    , 3.52  , 3.53  , Cur , 0.01
> > 22   , 0    , 23    , 4.61  , 3.51  , New , 1.1
> > 22   , 7    , 23    , 4.6   , 3.51  , New , 1.09
> > 25   , 0    , 24    , 3.5   , 3.53  , Cur , 0.03
> > 23   , 0    , 24    , 4.54  , 3.5   , New , 1.04
> > 26   , 0    , 25    , 3.47  , 3.49  , Cur , 0.02
> > 26   , 1    , 25    , 3.46  , 3.51  , Cur , 0.05
> > 24   , 0    , 25    , 4.53  , 3.51  , New , 1.02
> > 24   , 1    , 25    , 4.51  , 3.51  , New , 1.0
> > 27   , 0    , 26    , 3.44  , 3.51  , Cur , 0.07
> > 27   , 2    , 26    , 3.51  , 3.52  , Cur , 0.01
> > 25   , 0    , 26    , 4.56  , 3.46  , New , 1.1
> > 25   , 2    , 26    , 4.55  , 3.47  , New , 1.08
> > 28   , 0    , 27    , 3.47  , 3.5   , Cur , 0.03
> > 28   , 3    , 27    , 3.48  , 3.47  , New , 0.01
> > 26   , 0    , 27    , 4.52  , 3.44  , New , 1.08
> > 26   , 3    , 27    , 4.55  , 3.46  , New , 1.09
> > 29   , 0    , 28    , 3.45  , 3.49  , Cur , 0.04
> > 29   , 4    , 28    , 3.5   , 3.5   , Eq  , 0.0
> > 27   , 0    , 28    , 4.56  , 3.49  , New , 1.07
> > 27   , 4    , 28    , 4.5   , 3.49  , New , 1.01
> > 30   , 0    , 29    , 3.44  , 3.48  , Cur , 0.04
> > 30   , 5    , 29    , 3.46  , 3.47  , Cur , 0.01
> > 28   , 0    , 29    , 4.49  , 3.43  , New , 1.06
> > 28   , 5    , 29    , 4.57  , 3.45  , New , 1.12
> > 31   , 0    , 30    , 3.48  , 3.48  , Eq  , 0.0
> > 31   , 6    , 30    , 3.46  , 3.49  , Cur , 0.03
> > 29   , 0    , 30    , 4.49  , 3.44  , New , 1.05
> > 29   , 6    , 30    , 4.53  , 3.44  , New , 1.09
> > 32   , 0    , 31    , 3.44  , 3.45  , Cur , 0.01
> > 32   , 7    , 31    , 3.46  , 3.51  , Cur , 0.05
> > 30   , 0    , 31    , 4.48  , 3.42  , New , 1.06
> > 30   , 7    , 31    , 4.48  , 3.44  , New , 1.04
> >
> >
> > Results For Skylake memchr-avx2
> > size , algn , Pos , Cur T , New T , Win , Dif
> > 2048 , 0    , 32    , 6.61  , 5.4   , New , 1.21
> > 256  , 1    , 64    , 6.52  , 5.68  , New , 0.84
> > 2048 , 0    , 64    , 6.03  , 5.47  , New , 0.56
> > 256  , 2    , 64    , 6.07  , 5.42  , New , 0.65
> > 2048 , 0    , 128   , 7.01  , 5.83  , New , 1.18
> > 256  , 3    , 64    , 6.24  , 5.68  , New , 0.56
> > 2048 , 0    , 256   , 11.03 , 9.86  , New , 1.17
> > 256  , 4    , 64    , 6.17  , 5.49  , New , 0.68
> > 2048 , 0    , 512   , 14.11 , 13.41 , New , 0.7
> > 256  , 5    , 64    , 6.03  , 5.45  , New , 0.58
> > 2048 , 0    , 1024  , 19.82 , 19.92 , Cur , 0.1
> > 256  , 6    , 64    , 6.14  , 5.7   , New , 0.44
> > 2048 , 0    , 2048  , 30.9  , 30.59 , New , 0.31
> > 256  , 7    , 64    , 6.05  , 5.64  , New , 0.41
> > 192  , 1    , 32    , 5.6   , 4.89  , New , 0.71
> > 256  , 1    , 32    , 5.59  , 5.07  , New , 0.52
> > 512  , 1    , 32    , 5.58  , 4.93  , New , 0.65
> > 192  , 2    , 64    , 6.14  , 5.46  , New , 0.68
> > 512  , 2    , 64    , 5.95  , 5.38  , New , 0.57
> > 192  , 3    , 96    , 6.6   , 5.74  , New , 0.86
> > 256  , 3    , 96    , 6.48  , 5.37  , New , 1.11
> > 512  , 3    , 96    , 6.56  , 5.44  , New , 1.12
> > 192  , 4    , 128   , 7.04  , 6.02  , New , 1.02
> > 256  , 4    , 128   , 6.96  , 5.89  , New , 1.07
> > 512  , 4    , 128   , 6.97  , 5.99  , New , 0.98
> > 192  , 5    , 160   , 8.49  , 7.07  , New , 1.42
> > 256  , 5    , 160   , 8.1   , 6.96  , New , 1.14
> > 512  , 5    , 160   , 10.48 , 9.14  , New , 1.34
> > 192  , 6    , 192   , 8.46  , 8.52  , Cur , 0.06
> > 256  , 6    , 192   , 8.53  , 7.58  , New , 0.95
> > 512  , 6    , 192   , 10.88 , 9.06  , New , 1.82
> > 192  , 7    , 224   , 8.59  , 8.35  , New , 0.24
> > 256  , 7    , 224   , 8.86  , 7.91  , New , 0.95
> > 512  , 7    , 224   , 10.89 , 8.98  , New , 1.91
> > 2    , 0    , 1     , 4.28  , 3.62  , New , 0.66
> > 2    , 1    , 1     , 4.32  , 3.75  , New , 0.57
> > 0    , 0    , 1     , 3.76  , 3.24  , New , 0.52
> > 0    , 1    , 1     , 3.7   , 3.19  , New , 0.51
> > 3    , 0    , 2     , 4.16  , 3.67  , New , 0.49
> > 3    , 2    , 2     , 4.21  , 3.68  , New , 0.53
> > 1    , 0    , 2     , 4.25  , 3.74  , New , 0.51
> > 1    , 2    , 2     , 4.4   , 3.82  , New , 0.58
> > 4    , 0    , 3     , 4.43  , 3.88  , New , 0.55
> > 4    , 3    , 3     , 4.34  , 3.8   , New , 0.54
> > 2    , 0    , 3     , 4.33  , 3.79  , New , 0.54
> > 2    , 3    , 3     , 4.37  , 3.84  , New , 0.53
> > 5    , 0    , 4     , 4.45  , 3.87  , New , 0.58
> > 5    , 4    , 4     , 4.41  , 3.84  , New , 0.57
> > 3    , 0    , 4     , 4.34  , 3.83  , New , 0.51
> > 3    , 4    , 4     , 4.35  , 3.82  , New , 0.53
> > 6    , 0    , 5     , 4.41  , 3.88  , New , 0.53
> > 6    , 5    , 5     , 4.41  , 3.88  , New , 0.53
> > 4    , 0    , 5     , 4.35  , 3.84  , New , 0.51
> > 4    , 5    , 5     , 4.37  , 3.85  , New , 0.52
> > 7    , 0    , 6     , 4.4   , 3.84  , New , 0.56
> > 7    , 6    , 6     , 4.39  , 3.83  , New , 0.56
> > 5    , 0    , 6     , 4.37  , 3.85  , New , 0.52
> > 5    , 6    , 6     , 4.4   , 3.86  , New , 0.54
> > 8    , 0    , 7     , 4.39  , 3.88  , New , 0.51
> > 8    , 7    , 7     , 4.4   , 3.83  , New , 0.57
> > 6    , 0    , 7     , 4.39  , 3.85  , New , 0.54
> > 6    , 7    , 7     , 4.38  , 3.87  , New , 0.51
> > 9    , 0    , 8     , 4.47  , 3.96  , New , 0.51
> > 7    , 0    , 8     , 4.37  , 3.85  , New , 0.52
> > 10   , 0    , 9     , 4.61  , 4.08  , New , 0.53
> > 10   , 1    , 9     , 4.61  , 4.09  , New , 0.52
> > 8    , 0    , 9     , 4.37  , 3.85  , New , 0.52
> > 8    , 1    , 9     , 4.37  , 3.85  , New , 0.52
> > 11   , 0    , 10    , 4.68  , 4.06  , New , 0.62
> > 11   , 2    , 10    , 4.56  , 4.1   , New , 0.46
> > 9    , 0    , 10    , 4.36  , 3.83  , New , 0.53
> > 9    , 2    , 10    , 4.37  , 3.83  , New , 0.54
> > 12   , 0    , 11    , 4.62  , 4.05  , New , 0.57
> > 12   , 3    , 11    , 4.63  , 4.06  , New , 0.57
> > 10   , 0    , 11    , 4.38  , 3.86  , New , 0.52
> > 10   , 3    , 11    , 4.41  , 3.86  , New , 0.55
> > 13   , 0    , 12    , 4.57  , 4.08  , New , 0.49
> > 13   , 4    , 12    , 4.59  , 4.12  , New , 0.47
> > 11   , 0    , 12    , 4.45  , 4.0   , New , 0.45
> > 11   , 4    , 12    , 4.51  , 4.04  , New , 0.47
> > 14   , 0    , 13    , 4.64  , 4.16  , New , 0.48
> > 14   , 5    , 13    , 4.67  , 4.1   , New , 0.57
> > 12   , 0    , 13    , 4.58  , 4.08  , New , 0.5
> > 12   , 5    , 13    , 4.6   , 4.1   , New , 0.5
> > 15   , 0    , 14    , 4.61  , 4.05  , New , 0.56
> > 15   , 6    , 14    , 4.59  , 4.06  , New , 0.53
> > 13   , 0    , 14    , 4.57  , 4.06  , New , 0.51
> > 13   , 6    , 14    , 4.57  , 4.05  , New , 0.52
> > 16   , 0    , 15    , 4.62  , 4.05  , New , 0.57
> > 16   , 7    , 15    , 4.63  , 4.06  , New , 0.57
> > 14   , 0    , 15    , 4.61  , 4.06  , New , 0.55
> > 14   , 7    , 15    , 4.59  , 4.05  , New , 0.54
> > 17   , 0    , 16    , 4.58  , 4.08  , New , 0.5
> > 15   , 0    , 16    , 4.64  , 4.06  , New , 0.58
> > 18   , 0    , 17    , 4.56  , 4.17  , New , 0.39
> > 18   , 1    , 17    , 4.59  , 4.09  , New , 0.5
> > 16   , 0    , 17    , 4.59  , 4.07  , New , 0.52
> > 16   , 1    , 17    , 4.58  , 4.04  , New , 0.54
> > 19   , 0    , 18    , 4.61  , 4.05  , New , 0.56
> > 19   , 2    , 18    , 4.6   , 4.08  , New , 0.52
> > 17   , 0    , 18    , 4.64  , 4.11  , New , 0.53
> > 17   , 2    , 18    , 4.56  , 4.13  , New , 0.43
> > 20   , 0    , 19    , 4.77  , 4.3   , New , 0.47
> > 20   , 3    , 19    , 4.6   , 4.14  , New , 0.46
> > 18   , 0    , 19    , 4.72  , 4.02  , New , 0.7
> > 18   , 3    , 19    , 4.53  , 4.01  , New , 0.52
> > 21   , 0    , 20    , 4.66  , 4.26  , New , 0.4
> > 21   , 4    , 20    , 4.74  , 4.07  , New , 0.67
> > 19   , 0    , 20    , 4.62  , 4.12  , New , 0.5
> > 19   , 4    , 20    , 4.57  , 4.04  , New , 0.53
> > 22   , 0    , 21    , 4.61  , 4.13  , New , 0.48
> > 22   , 5    , 21    , 4.64  , 4.08  , New , 0.56
> > 20   , 0    , 21    , 4.49  , 4.01  , New , 0.48
> > 20   , 5    , 21    , 4.58  , 4.06  , New , 0.52
> > 23   , 0    , 22    , 4.62  , 4.13  , New , 0.49
> > 23   , 6    , 22    , 4.72  , 4.27  , New , 0.45
> > 21   , 0    , 22    , 4.65  , 3.97  , New , 0.68
> > 21   , 6    , 22    , 4.5   , 4.02  , New , 0.48
> > 24   , 0    , 23    , 4.78  , 4.07  , New , 0.71
> > 24   , 7    , 23    , 4.67  , 4.23  , New , 0.44
> > 22   , 0    , 23    , 4.49  , 3.99  , New , 0.5
> > 22   , 7    , 23    , 4.56  , 4.03  , New , 0.53
> > 25   , 0    , 24    , 4.6   , 4.15  , New , 0.45
> > 23   , 0    , 24    , 4.57  , 4.06  , New , 0.51
> > 26   , 0    , 25    , 4.54  , 4.14  , New , 0.4
> > 26   , 1    , 25    , 4.72  , 4.1   , New , 0.62
> > 24   , 0    , 25    , 4.52  , 4.13  , New , 0.39
> > 24   , 1    , 25    , 4.55  , 4.0   , New , 0.55
> > 27   , 0    , 26    , 4.51  , 4.06  , New , 0.45
> > 27   , 2    , 26    , 4.53  , 4.16  , New , 0.37
> > 25   , 0    , 26    , 4.59  , 4.09  , New , 0.5
> > 25   , 2    , 26    , 4.55  , 4.01  , New , 0.54
> > 28   , 0    , 27    , 4.59  , 3.99  , New , 0.6
> > 28   , 3    , 27    , 4.57  , 3.95  , New , 0.62
> > 26   , 0    , 27    , 4.55  , 4.15  , New , 0.4
> > 26   , 3    , 27    , 4.57  , 3.99  , New , 0.58
> > 29   , 0    , 28    , 4.41  , 4.03  , New , 0.38
> > 29   , 4    , 28    , 4.59  , 4.02  , New , 0.57
> > 27   , 0    , 28    , 4.63  , 4.08  , New , 0.55
> > 27   , 4    , 28    , 4.44  , 4.02  , New , 0.42
> > 30   , 0    , 29    , 4.53  , 3.93  , New , 0.6
> > 30   , 5    , 29    , 4.55  , 3.88  , New , 0.67
> > 28   , 0    , 29    , 4.49  , 3.9   , New , 0.59
> > 28   , 5    , 29    , 4.44  , 3.94  , New , 0.5
> > 31   , 0    , 30    , 4.41  , 3.85  , New , 0.56
> > 31   , 6    , 30    , 4.48  , 3.86  , New , 0.62
> > 29   , 0    , 30    , 4.55  , 3.94  , New , 0.61
> > 29   , 6    , 30    , 4.32  , 3.95  , New , 0.37
> > 32   , 0    , 31    , 4.36  , 3.91  , New , 0.45
> > 32   , 7    , 31    , 4.37  , 3.89  , New , 0.48
> > 30   , 0    , 31    , 4.65  , 3.9   , New , 0.75
> > 30   , 7    , 31    , 4.42  , 3.93  , New , 0.49
> >
> > sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
> > 1 file changed, 349 insertions(+), 231 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> > index 6dd5d67b90..65c16ef8a4 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> > @@ -26,14 +26,28 @@
> >
> > # ifdef USE_AS_WMEMCHR
> > # define VPBROADCAST vpbroadcastd
> > -# define VPCMP vpcmpd
> > -# define SHIFT_REG r8d
> > +# define VPMINU vpminud
> > +# define VPCMP vpcmpd
> > +# define VPCMPEQ vpcmpeqd
> > +# define CHAR_SIZE 4
> > # else
> > # define VPBROADCAST vpbroadcastb
> > -# define VPCMP vpcmpb
> > -# define SHIFT_REG ecx
> > +# define VPMINU vpminub
> > +# define VPCMP vpcmpb
> > +# define VPCMPEQ vpcmpeqb
> > +# define CHAR_SIZE 1
> > # endif
> >
> > +# ifdef USE_AS_RAWMEMCHR
> > +# define RAW_PTR_REG rcx
> > +# define ALGN_PTR_REG rdi
> > +# else
> > +# define RAW_PTR_REG rdi
> > +# define ALGN_PTR_REG rcx
> > +# endif
> > +
> > +#define XZERO xmm23
>
> Add a space before define. Rename XZERO to XMMZERO.
Done.
>
> > +#define YZERO ymm23
>
> Add a space before define. Rename YZERO to YMMZERO.
Done.
>
> > # define XMMMATCH xmm16
> > # define YMMMATCH ymm16
> > # define YMM1 ymm17
> > @@ -44,18 +58,16 @@
> > # define YMM6 ymm22
> >
> > # define VEC_SIZE 32
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +# define PAGE_SIZE 4096
> >
> > .section .text.evex,"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.
Fixed.
>
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check for zero length. */
> > test %RDX_LP, %RDX_LP
> > jz L(zero)
> > -# endif
> > - movl %edi, %ecx
> > -# ifdef USE_AS_WMEMCHR
> > - shl $2, %RDX_LP
> > -# else
> > +
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
> > # endif
> > /* Broadcast CHAR to YMMMATCH. */
> > VPBROADCAST %esi, %YMMMATCH
> > - /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + /* Check if we may cross page boundary with one
> > + vector load. */
>
> Fit comments to 72 columns.
Fixed.
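
For reference, the relaxed page-cross test amounts to roughly the
following C sketch (PAGE_SIZE and VEC_SIZE as defined in the patch;
the helper name is illustrative, not the committed code):

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE 32

    /* A VEC_SIZE load from p can only fault if it runs past the end
       of p's page, so the slow path is needed only when the page
       offset falls in the last VEC_SIZE - 1 bytes of the page.  */
    static inline int
    load_may_cross_page (const void *p)
    {
      return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
    }

The old code took the slow path whenever the load was not contained
in an aligned 2 * VEC_SIZE region, which fires far more often than an
actual page cross.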
>
> > + movl %edi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(cross_page_boundary)
> >
> > /* Check the first VEC_SIZE bytes. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > -
> > + VPCMP $0, (%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > - jnz L(first_vec_x0_check)
> > - /* Adjust length and check the end of data. */
> > - subq $VEC_SIZE, %rdx
> > - jbe L(zero)
> > + /* If length < CHAR_PER_VEC handle special. */
> > + cmpq $CHAR_PER_VEC, %rdx
> > + jbe L(first_vec_x0)
> > +# endif
> > + testl %eax, %eax
> > + jz L(aligned_more)
> > + tzcntl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
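
For context, the leaq with a CHAR_SIZE scale is plain index-to-pointer
arithmetic; in C terms (illustrative, not the committed code):

    /* idx is the tzcnt bit index; for wmemchr each mask bit stands
       for one wchar_t, so the byte offset is idx * CHAR_SIZE.  */
    char *ret = (char *) base + (size_t) idx * CHAR_SIZE;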
>
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - jnz L(first_vec_x0)
> > + addq %rdi, %rax
> > # endif
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + ret
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > - addq %rcx, %rdx
> > -
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > - jmp L(more_4x_vec)
> > +L(zero):
> > + xorl %eax, %eax
> > + ret
> >
> > + .p2align 5
> > +L(first_vec_x0):
> > + /* Check if first match was before length. */
> > + tzcntl %eax, %eax
> > + xorl %ecx, %ecx
> > + cmpl %eax, %edx
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > + cmovle %rcx, %rax
> > + ret
> > +# else
> > + /* NB: first_vec_x0 is 17 bytes which will leave
> > + cross_page_boundary (which is relatively cold) close
> > + enough to ideal alignment. So only realign
> > + L(cross_page_boundary) if rawmemchr. */
>
> Fit comments to 72 columns.
Fixed.
>
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > +# endif
> > +L(cross_page_boundary):
> > + /* Save pointer before aligning as its original
> > + value is necessary for computing return address if byte is
> > + found or adjusting length if it is not and this is
> > + memchr. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + movq %rdi, %rcx
> > + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> > + for memchr and rdi for rawmemchr. */
>
> Fit comments to 72 columns.
Fixed.
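
The cross-page fixup can be read as this C sketch (VEC_SIZE 32 as in
the patch; the name is illustrative): compare the whole aligned
vector, then shift off the match bits for bytes before the original
pointer.

    #include <stdint.h>

    #define VEC_SIZE 32

    static inline uint32_t
    matches_from_start (uint32_t aligned_mask, const void *orig)
    {
      /* sarxl in the asm; a logical shift is equivalent here since
         tzcnt only ever looks at the lowest set bit.  */
      return aligned_mask >> ((uintptr_t) orig & (VEC_SIZE - 1));
    }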
>
> > + andq $-VEC_SIZE, %ALGN_PTR_REG
> > + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> > + kmovd %k0, %r8d
> > # ifdef USE_AS_WMEMCHR
> > - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > - bytes. */
> > - movl %ecx, %SHIFT_REG
> > - sarl $2, %SHIFT_REG
> > + /* NB: Divide shift count by 4 since each bit in
> > + K0 represent 4 bytes. */
> > + sarl $2, %eax
> > +# endif
> > +# ifndef USE_AS_RAWMEMCHR
> > + movl $(PAGE_SIZE / CHAR_SIZE), %esi
> > + subl %eax, %esi
> > # endif
> > - andq $-VEC_SIZE, %rdi
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - /* Remove the leading bytes. */
> > - sarxl %SHIFT_REG, %eax, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - tzcntl %eax, %eax
> > # ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > + andl $(CHAR_PER_VEC - 1), %eax
> > # endif
> > + /* Remove the leading bytes. */
> > + sarxl %eax, %r8d, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + cmpq %rsi, %rdx
> > + jbe L(first_vec_x0)
> > +# endif
> > + testl %eax, %eax
> > + jz L(cross_page_continue)
> > + tzcntl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
> > + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %RAW_PTR_REG, %rax
> > # endif
> > - addq %rdi, %rax
> > - addq %rcx, %rax
> > ret
> >
> > .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > - overflow. */
> > - negq %rcx
> > - addq $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > + tzcntl %eax, %eax
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Check the end of data. */
> > - subq %rcx, %rdx
> > - jbe L(zero)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x2):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4
> > +L(first_vec_x3):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > + .p2align 4
> > +L(first_vec_x4):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +
> > + .p2align 5
> > +L(aligned_more):
> > + /* Check the first 4 * VEC_SIZE. Only one
> > + VEC_SIZE at a time since data is only aligned to
> > + VEC_SIZE. */
>
> Fit comments to 72 columns.
Fixed.
>
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > + /* Align data to VEC_SIZE. */
> > +L(cross_page_continue):
> > + xorl %ecx, %ecx
> > + subl %edi, %ecx
> > + andq $-VEC_SIZE, %rdi
> > + /* esi is for adjusting length to see if near the
> > + end. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Divide bytes by 4 to get the wchar_t
> > + count. */
> > + sarl $2, %esi
> > +# endif
> > +# else
> > + andq $-VEC_SIZE, %rdi
> > +L(cross_page_continue):
> > +# endif
> > + /* Load first VEC regardless. */
> > + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Adjust length. If near end handle specially.
> > + */
>
> Fit comments to 72 columns.
Fixed.
>
> > + subq %rsi, %rdx
> > jbe L(last_4x_vec_or_less)
> > # endif
> > -
> > -L(more_4x_vec):
> > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > - since data is only aligned to VEC_SIZE. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > -
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - addq $(VEC_SIZE * 4), %rdi
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(first_vec_x4)
> > +
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > + /* Check if at last CHAR_PER_VEC * 4 length. */
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + jbe L(last_4x_vec_or_less_cmpeq)
> > + addq $VEC_SIZE, %rdi
> >
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > + /* Align data to VEC_SIZE * 4 for the loop and
> > + readjust length. */
>
> Fit comments to 72 columns.
Fixed.
>
> > +# ifdef USE_AS_WMEMCHR
> > + movl %edi, %ecx
> > andq $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > + andl $(VEC_SIZE * 4 - 1), %ecx
> > + /* NB: Divide bytes by 4 to get the wchar_t
> > + count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + sarl $2, %ecx
> > addq %rcx, %rdx
> > +# else
> > + addq %rdi, %rdx
> > + andq $-(4 * VEC_SIZE), %rdi
> > + subq %rdi, %rdx
> > +# endif
> > +# else
> > + addq $VEC_SIZE, %rdi
> > + andq $-(4 * VEC_SIZE), %rdi
> > # endif
> >
> > + vpxorq %XZERO, %XZERO, %XZERO
> > +
> > + /* Compare 4 * VEC at a time forward. */
> > .p2align 4
> > L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> > - kord %k1, %k2, %k5
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> > -
> > - kord %k3, %k4, %k6
> > - kortestd %k5, %k6
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > + /* It would be possible to save some instructions
> > + using 4x VPCMP but bottleneck on port 5 makes it not worth
> > + it. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> > + /* xor will set bytes match esi to zero. */
> > + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> > + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> > + /* Reduce VEC2 / VEC3 with min and VEC1 with zero
> > + mask. */
>
> Fit comments to 72 columns.
Fixed.
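
As a rough intrinsics model of this reduction (byte case; assumes
AVX512BW/VL; function and variable names are illustrative, not the
committed code):

    #include <immintrin.h>

    static inline int
    any_match_4x (const char *p, __m256i match)
    {
      /* Predicate 4 (not-equal): k1 is set where VEC1 does NOT match.  */
      __mmask32 k1 = _mm256_cmpneq_epi8_mask
        (_mm256_loadu_si256 ((const __m256i *) p), match);
      /* xor turns matching bytes in VEC2/VEC3 into zero bytes.  */
      __m256i v2 = _mm256_xor_si256
        (_mm256_loadu_si256 ((const __m256i *) (p + 32)), match);
      __m256i v3 = _mm256_xor_si256
        (_mm256_loadu_si256 ((const __m256i *) (p + 64)), match);
      __mmask32 k3 = _mm256_cmpeq_epi8_mask
        (_mm256_loadu_si256 ((const __m256i *) (p + 96)), match);
      /* min folds VEC2/VEC3; zero-masking with k1 forces a zero lane
         wherever VEC1 matched, so one compare covers VEC1-VEC3.  */
      __m256i red = _mm256_maskz_min_epu8 (k1, v2, v3);
      __mmask32 k2 = _mm256_cmpeq_epi8_mask (red, _mm256_setzero_si256 ());
      /* kortestd in the asm.  */
      return (k2 | k3) != 0;
    }

This needs one fewer compare-into-mask and no kord reductions on port
5; the xor/min work can execute on other ports, relieving the
bottleneck the comment mentions.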
>
> > + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
> > + VPCMP $0, %YMM3, %YZERO, %k2
> > # ifdef USE_AS_RAWMEMCHR
> > - jmp L(loop_4x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> > + kortestd %k2, %k3
> > + jz L(loop_4x_vec)
> > # else
> > - subq $(VEC_SIZE * 4), %rdx
> > - ja L(loop_4x_vec)
> > + kortestd %k2, %k3
> > + jnz L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> > - addl $(VEC_SIZE * 2), %edx
> > - jle L(last_2x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> >
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + ja L(loop_4x_vec)
> >
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + /* Fall through into less than 4 remaining
> > + vectors of length case. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + addq $(VEC_SIZE * 3), %rdi
> > + .p2align 4
> > +L(last_4x_vec_or_less):
> > + /* Check if first VEC contained match. */
> > testl %eax, %eax
> > - jnz L(first_vec_x1)
> > + jnz L(first_vec_x1_check)
> >
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + /* If remaining length > CHAR_PER_VEC * 2. */
> > + addl $(CHAR_PER_VEC * 2), %edx
> > + jg L(last_4x_vec)
> >
> > - jnz L(first_vec_x2_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > +L(last_2x_vec):
> > + /* If remaining length < CHAR_PER_VEC. */
> > + addl $CHAR_PER_VEC, %edx
> > + jle L(zero_end)
> > +
> > + /* Check VEC2 and compare any match with
> > + remaining length. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + tzcntl %eax, %eax
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end):
> > + ret
> >
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> >
> > - jnz L(first_vec_x3_check)
> > + .p2align 4
> > +L(first_vec_x1_check):
> > + tzcntl %eax, %eax
> > + /* Adjust length. */
> > + subl $-(CHAR_PER_VEC * 4), %edx
> > + /* Check if match within remaining length. */
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +L(set_zero_end):
> > xorl %eax, %eax
> > ret
> >
> > .p2align 4
> > -L(last_2x_vec):
> > - addl $(VEC_SIZE * 2), %edx
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > +L(loop_4x_vec_end):
> > +# endif
> > + /* rawmemchr will fall through into this if match
> > + was found in loop. */
>
> Fit comments to 72 columns.
Fixed.
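
The increment trick works because k1 holds a set bit for every lane
that did NOT match, so an all-ones mask means no match. A C model of
the byte case (illustrative):

    #include <stdint.h>

    static inline int
    vec1_has_match (uint32_t neq_mask)
    {
      /* incl in the asm: all-ones (no match anywhere) wraps to 0.
         The wmemchr path instead subtracts (1 << CHAR_PER_VEC) - 1
         because only the low 8 mask bits are meaningful there.  */
      return neq_mask + 1 != 0;
    }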
>
> > +
> > + /* k1 has the NOT of matches with VEC1. */
> > kmovd %k1, %eax
> > - testl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + subl $((1 << CHAR_PER_VEC) - 1), %eax
> > +# else
> > + incl %eax
> > +# endif
> > + jnz L(last_vec_x1_return)
> >
> > - jnz L(first_vec_x0_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > + VPCMP $0, %YMM2, %YZERO, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2_return)
> >
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + kmovd %k2, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x1_check)
> > - xorl %eax, %eax
> > - ret
> > + jnz L(last_vec_x3_return)
> >
> > - .p2align 4
> > -L(first_vec_x0_check):
> > + kmovd %k3, %eax
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_x1_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > -# endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $VEC_SIZE, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > addq %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(first_vec_x2_check):
> > - tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# endif
> > +# else
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
> > + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 3), %rax
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(zero):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> > -
> > - .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x3_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (%rdi, %rax, 4), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - addq %rdi, %rax
> > + /* NB: Multiply bytes by CHAR_SIZE to get the
> > + wchar_t count. */
>
> Fit comments to 72 columns.
Fixed.
>
> > + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > ret
> >
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(last_4x_vec_or_less_cmpeq):
> > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + subq $-(VEC_SIZE * 4), %rdi
> > + /* Check first VEC regardless. */
> > + testl %eax, %eax
> > + jnz L(first_vec_x1_check)
> > +
> > + /* If remaining length <= CHAR_PER_VEC * 2. */
> > + addl $(CHAR_PER_VEC * 2), %edx
> > + jle L(last_2x_vec)
> > +
> > .p2align 4
> > -L(first_vec_x1):
> > +L(last_4x_vec):
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2)
> > +
> > +
> > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + /* Create mask for possible matches within
> > + remaining length. */
>
> Fit comments to 72 columns.
Fixed.
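
bzhi builds the within-remaining-length mask in a single instruction;
a C sketch of the same computation (illustrative name):

    #include <stdint.h>

    /* Keep only match bits below the remaining length.  bzhiq treats
       an index >= 64 as keep-everything, matching the ternary.  */
    static inline uint64_t
    in_bounds_matches (uint64_t matches, uint64_t remaining)
    {
      uint64_t keep = remaining < 64 ? (1ULL << remaining) - 1 : ~0ULL;
      return matches & keep;
    }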
>
> > +# ifdef USE_AS_WMEMCHR
> > + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> > + bzhil %edx, %ecx, %ecx
> > +# else
> > + movq $-1, %rcx
> > + bzhiq %rdx, %rcx, %rcx
> > +# endif
> > + /* Test matches in data against length match. */
> > + andl %ecx, %eax
> > + jnz L(last_vec_x3)
> > +
> > + /* If remaining length <= CHAR_PER_VEC * 3 (note
> > + this is after remaining length was found to be >
> > + CHAR_PER_VEC * 2). */
>
> Fit comments to 72 columns.
Fixed.
>
> > + subl $CHAR_PER_VEC, %edx
> > + jbe L(zero_end2)
> > +
> > +
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + /* Shift remaining length mask for last VEC. */
> > +# ifdef USE_AS_WMEMCHR
> > + shrl $CHAR_PER_VEC, %ecx
> > +# else
> > + shrq $CHAR_PER_VEC, %rcx
> > +# endif
> > + andl %ecx, %eax
> > + jz L(zero_end2)
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end2):
> > ret
> >
> > - .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x2):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> > -# else
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > .p2align 4
> > -L(4x_vec_end):
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > - kmovd %k2, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > - kmovd %k3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > - kmovd %k4, %eax
> > - testl %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> > -# else
> > - addq $(VEC_SIZE * 3), %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.
Fixed.
>
> > #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v2 2/3] x86: Optimize memchr-avx2.S
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 18:50 ` H.J. Lu
@ 2021-05-03 20:06 ` Noah Goldstein
2021-05-03 20:06 ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:25 ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
2021-05-03 22:58 ` [PATCH v3 " Noah Goldstein
2 siblings, 2 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict, and
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
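
(As a rough C model of the cmovcc rewrite in L(first_vec_x0), with
illustrative names: tzcnt of a zero mask yields 32, which the ternary
below makes explicit.)

    #include <stddef.h>
    #include <stdint.h>

    static inline const char *
    first_vec_result (const char *p, uint32_t mask, uint32_t len)
    {
      uint32_t idx = mask ? (uint32_t) __builtin_ctz (mask) : 32;
      const char *hit = p + idx;       /* computed unconditionally */
      return len <= idx ? NULL : hit;  /* cmovle instead of a branch */
    }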
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
1 file changed, 247 insertions(+), 179 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..8b862fb9d1 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,348 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+ for computing return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-L(more_4x_vec):
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= VEC_SIZE * 3 (note this is after
+ remaining length was found to be > VEC_SIZE * 2). */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
--
2.29.2
^ permalink raw reply [flat|nested] 20+ messages in thread
* [PATCH v2 3/3] x86: Optimize memchr-evex.S
2021-05-03 20:06 ` [PATCH v2 " Noah Goldstein
@ 2021-05-03 20:06 ` Noah Goldstein
2021-05-03 22:26 ` H.J. Lu
2021-05-03 22:25 ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
1 sibling, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
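
(For the relaxed page-cross handling, the end-of-page length check can
be read as this C sketch; PAGE_SIZE and CHAR_SIZE as in the patch,
names illustrative.)

    #include <stdint.h>

    /* Characters from p to the end of its page.  If the total length
       is no larger than this, the single cross-page vector compare
       already covered every byte that needs checking.  */
    static inline uint64_t
    chars_to_page_end (const void *p, unsigned char_size)
    {
      uint64_t page_off = (uintptr_t) p & 4095;  /* PAGE_SIZE - 1 */
      return (4096 - page_off) / char_size;
    }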
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..147d7aa8ee 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XZERO xmm23
+# define YZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes, which leaves
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computing the return address if a byte is found
+ or for adjusting the length if it is not (memchr only). */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XZERO, %XZERO, %XZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but the bottleneck on port 5 makes it not worth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes that match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
+ /* Fall through into the case where fewer than 4 vectors of
+ length remain. */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if a match was found
+ in the loop. */
+
+ /* k1 has the NOT of matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2). */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
END (MEMCHR)
#endif
--
2.29.2
^ permalink raw reply [flat|nested] 20+ messages in thread
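A hedged C sketch (constants as in the patch above; the function name
is illustrative) of the relaxed page-cross test: a VEC_SIZE load from
addr stays within one 4 KiB page iff the offset of addr within the
page is at most PAGE_SIZE - VEC_SIZE, so the slow path now triggers
only in the last VEC_SIZE - 1 bytes of each page instead of for a
whole alignment window.

#include <stdbool.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE 32

static bool
may_cross_page (uintptr_t addr)
{
  /* andl $(PAGE_SIZE - 1), %eax; cmpl $(PAGE_SIZE - VEC_SIZE), %eax;
     ja L(cross_page_boundary) -- ja is an unsigned '>'.  */
  return (addr & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}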
* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
2021-05-03 19:51 ` Noah Goldstein
@ 2021-05-03 20:59 ` H.J. Lu
0 siblings, 0 replies; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 20:59 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 12:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, May 3, 2021 at 1:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No bug. This commit adds some additional cases for bench-memchr.c
> > > including testing medium sizes and testing short length with both an
> > > inbound match and out of bound match.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > > benchtests/bench-memchr.c | 13 +++++++++++++
> > > 1 file changed, 13 insertions(+)
> > >
> > > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> > > index f5ced9d80d..5573f93312 100644
> > > --- a/benchtests/bench-memchr.c
> > > +++ b/benchtests/bench-memchr.c
> > > @@ -135,12 +135,25 @@ test_main (void)
> > > do_test (i, i, 256, 0);
> > > #endif
> > > }
> > > + for (i = 1; i < 8; ++i)
> > > + {
> > > + do_test (i, i << 5, 192, 23);
> > > + do_test (i, i << 5, 192, 0);
> > > + do_test (i, i << 5, 256, 23);
> > > + do_test (i, i << 5, 256, 0);
> > > + do_test (i, i << 5, 512, 23);
> > > + do_test (i, i << 5, 512, 0);
> > > + }
> > > for (i = 1; i < 32; ++i)
> > > {
> > > do_test (0, i, i + 1, 23);
> > > do_test (0, i, i + 1, 0);
> > > do_test (i, i, i + 1, 23);
> > > do_test (i, i, i + 1, 0);
> > > + do_test (0, i, i - 1, 23);
> > > + do_test (0, i, i - 1, 0);
> > > + do_test (i, i, i - 1, 23);
> > > + do_test (i, i, i - 1, 0);
> > > #ifdef USE_AS_MEMRCHR
> > > /* Also test the position close to the beginning for memrchr. */
> > > do_test (0, 1, i + 1, 23);
> > > --
> > > 2.29.2
> > >
> >
> > LGTM. I will check it in for you.
>
> Thanks!
>
> >
> > BTW, can you apply an account on sourceware.org:
> >
> > https://sourceware.org/
> >
> > so that you can push your commits directly? You can put me down
> > as your sponsor.
>
> Done. Are there any wikis / manuals on how to properly use write access?
https://sourceware.org/glibc/wiki/HomePage
has a lot of good information.
> All I'm finding are resources on how to obtain it.
>
> >
> > Thanks.
> >
> > --
> > H.J.
--
H.J.
^ permalink raw reply [flat|nested] 20+ messages in thread
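Both memchr patches in this thread use sarx in the cross-page path; a
minimal C model of that step (byte variant, assuming VEC_SIZE of 32;
the helper name is made up): after backing the load up to an aligned
address, match bits for bytes below the original pointer must be
shifted out, and sarx takes its count from an arbitrary register,
saving the mov-into-cl that the old sarl %cl, %eax form required.

#include <stdint.h>

static uint32_t
remove_leading_bytes (uint32_t match_mask, uintptr_t orig_ptr)
{
  unsigned int shift = orig_ptr & 31;  /* Bytes before orig_ptr.  */
  /* The asm uses an arithmetic shift (sarxl); the possible sign
     smear is harmless because tzcnt consumes the lowest set bit,
     so a plain logical shift models it.  */
  return match_mask >> shift;
}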
* Re: [PATCH v2 2/3] x86: Optimize memchr-avx2.S
2021-05-03 20:06 ` [PATCH v2 " Noah Goldstein
2021-05-03 20:06 ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:25 ` H.J. Lu
2021-05-03 22:58 ` Noah Goldstein
1 sibling, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 22:25 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools
On Mon, May 03, 2021 at 04:06:54PM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving a few instructions in the loop return path. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
> 1 file changed, 247 insertions(+), 179 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..8b862fb9d1 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPCMPEQ vpcmpeqd
> +# define VPBROADCAST vpbroadcastd
> +# define CHAR_SIZE 4
> # else
> # define VPCMPEQ vpcmpeqb
> +# define VPBROADCAST vpbroadcastb
> +# define CHAR_SIZE 1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +# define ERAW_PTR_REG ecx
> +# define RRAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define ERAW_PTR_REG edi
> +# define RRAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> # endif
>
> # ifndef VZEROUPPER
> @@ -39,6 +53,7 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section SECTION(.text),"ax",@progbits
> ENTRY (MEMCHR)
> @@ -47,295 +62,348 @@ ENTRY (MEMCHR)
> test %RDX_LP, %RDX_LP
> jz L(null)
> # endif
> - movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> - vmovd %esi, %xmm0
> # ifdef USE_AS_WMEMCHR
> shl $2, %RDX_LP
> - vpbroadcastd %xmm0, %ymm0
> # else
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> # endif
> - vpbroadcastb %xmm0, %ymm0
> # endif
> + /* Broadcast CHAR to YMMMATCH. */
> + vmovd %esi, %xmm0
> + VPBROADCAST %xmm0, %ymm0
> /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> + VPCMPEQ (%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> -# else
> - jnz L(first_vec_x0)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $VEC_SIZE, %rdx
> + jbe L(first_vec_x0)
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax), %rax
> + cmovle %rcx, %rax
> + VZEROUPPER_RETURN
Please add a blank line here to indicate this begins a new block.
OK with this change. You should be able to push it yourself now.
Thanks.
> +L(null):
> + xorl %eax, %eax
> + ret
> # endif
> - jmp L(more_4x_vec)
> -
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original value is necessary
> + for computing the return address if a byte is found or adjusting
> + the length if it is not (memchr only). */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> + rdi for rawmemchr. */
> + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Calculate length until end of page (length checked for a
> + match). */
> + leaq 1(%ALGN_PTR_REG), %rsi
> + subq %RRAW_PTR_REG, %rsi
> +# endif
> /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> + sarxl %ERAW_PTR_REG, %eax, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> + addq %RRAW_PTR_REG, %rax
> L(return_vzeroupper):
> ZERO_UPPER_VEC_REGISTERS_RETURN
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + incq %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 2 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
>
> -L(more_4x_vec):
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
> +
> + .p2align 4
> +L(aligned_more):
> /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> + /* Align data to VEC_SIZE - 1. */
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + orq $(VEC_SIZE - 1), %rdi
> + /* esi is for adjusting length to see if near the end. */
> + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> + orq $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially. */
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
>
> # ifndef USE_AS_RAWMEMCHR
> + /* Check if at last VEC_SIZE * 4 length. */
> subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + jbe L(last_4x_vec_or_less_cmpeq)
> + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> + length. */
> + incq %rdi
> + movl %edi, %ecx
> + orq $(VEC_SIZE * 4 - 1), %rdi
> + andl $(VEC_SIZE * 4 - 1), %ecx
> addq %rcx, %rdx
> +# else
> + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> + incq %rdi
> + orq $(VEC_SIZE * 4 - 1), %rdi
> # endif
>
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> vpor %ymm1, %ymm2, %ymm5
> vpor %ymm3, %ymm4, %ymm6
> vpor %ymm5, %ymm6, %ymm5
>
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + vpmovmskb %ymm5, %ecx
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + testl %ecx, %ecx
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec)
> + testl %ecx, %ecx
> + jnz L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
>
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(loop_4x_vec)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> + /* Fall through into the case where fewer than 4 vectors of
> + length remain. */
> + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> + .p2align 4
> +L(last_4x_vec_or_less):
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x1)
> + jnz L(first_vec_x1_check)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> + /* If remaining length > VEC_SIZE * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jg L(last_4x_vec)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> +L(last_2x_vec):
> + /* If remaining length < VEC_SIZE. */
> + addl $VEC_SIZE, %edx
> + jle L(zero_end)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + /* Check VEC2 and compare any match with remaining length. */
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> - jnz L(first_vec_x3_check)
> - xorl %eax, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> +L(zero_end):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if a match was found
> + in the loop. */
> +
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> -
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - VZEROUPPER_RETURN
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> - tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + vpmovmskb %ymm3, %eax
> + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
> + salq $32, %rcx
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 2 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>
> .p2align 4
> L(first_vec_x1_check):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> + /* Adjust length. */
> + subl $-(VEC_SIZE * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + incq %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> + .p2align 4
> +L(set_zero_end):
> + xorl %eax, %eax
> + VZEROUPPER_RETURN
> +# endif
>
> .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 4 - 1), %rdi
> +# else
> + incq %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 3 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> +# ifndef USE_AS_RAWMEMCHR
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - jmp L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
>
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jle L(last_2x_vec)
> .p2align 4
> -L(null):
> - xorl %eax, %eax
> - ret
> -# endif
> +L(last_4x_vec):
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0):
> - tzcntl %eax, %eax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
>
> - .p2align 4
> -L(first_vec_x1):
> - tzcntl %eax, %eax
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + /* Create mask for possible matches within remaining length. */
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
>
> - .p2align 4
> -L(first_vec_x2):
> + /* Test matches in data against length match. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* If remaining length <= VEC_SIZE * 3 (note this is after
> + remaining length was found to be > VEC_SIZE * 2). */
> + subl $VEC_SIZE, %edx
> + jbe L(zero_end2)
> +
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Shift remaining length mask for last VEC. */
> + shrq $32, %rcx
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 2), %rax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> addq %rdi, %rax
> +L(zero_end2):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(4x_vec_end):
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - vpmovmskb %ymm2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - vpmovmskb %ymm4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 3), %rax
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# endif
>
> END (MEMCHR)
> #endif
> --
> 2.29.2
>
^ permalink raw reply [flat|nested] 20+ messages in thread
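A short C model of the bzhi trick in the avx2 patch above (the
function name is made up; _bzhi_u64 requires BMI2, which the patch
already assumes for sarx): build a mask of the low `remaining' bits
in two instructions and use it to discard matches past the end of
the buffer, rather than branching on the length per vector.

#include <immintrin.h>
#include <stdint.h>

static uint64_t
in_bounds_matches (uint64_t match_bits, uint64_t remaining)
{
  /* movq $-1, %rcx; bzhiq %rdx, %rcx, %rcx.  For an index >= 64
     bzhi leaves the source unchanged, i.e. all ones.  */
  uint64_t keep = _bzhi_u64 (~0ULL, (unsigned int) remaining);
  return match_bits & keep;  /* andl %ecx, %eax; jnz -> match.  */
}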
* Re: [PATCH v2 3/3] x86: Optimize memchr-evex.S
2021-05-03 20:06 ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:26 ` H.J. Lu
2021-05-03 22:58 ` Noah Goldstein
0 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 22:26 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools
On Mon, May 03, 2021 at 04:06:55PM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU operations in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
> 1 file changed, 322 insertions(+), 225 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..147d7aa8ee 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPBROADCAST vpbroadcastd
> -# define VPCMP vpcmpd
> -# define SHIFT_REG r8d
> +# define VPMINU vpminud
> +# define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> +# define CHAR_SIZE 4
> # else
> # define VPBROADCAST vpbroadcastb
> -# define VPCMP vpcmpb
> -# define SHIFT_REG ecx
> +# define VPMINU vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ vpcmpeqb
> +# define CHAR_SIZE 1
> # endif
>
> +# ifdef USE_AS_RAWMEMCHR
> +# define RAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define RAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> +# endif
> +
> +# define XZERO xmm23
> +# define YZERO ymm23
Please rename XZERO/YZERO to XMMZERO/YMMZERO. OK with this change.
Thanks.
> # define XMMMATCH xmm16
> # define YMMMATCH ymm16
> # define YMM1 ymm17
> @@ -44,6 +58,8 @@
> # define YMM6 ymm22
>
> # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>
> .section .text.evex,"ax",@progbits
> ENTRY (MEMCHR)
> @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
> /* Check for zero length. */
> test %RDX_LP, %RDX_LP
> jz L(zero)
> -# endif
> - movl %edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> - shl $2, %RDX_LP
> -# else
> +
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
> /* Broadcast CHAR to YMMMATCH. */
> VPBROADCAST %esi, %YMMMATCH
> /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> -
> + VPCMP $0, (%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $CHAR_PER_VEC, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> - jnz L(first_vec_x0)
> + addq %rdi, %rax
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + ret
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> - jmp L(more_4x_vec)
> +L(zero):
> + xorl %eax, %eax
> + ret
>
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> + cmovle %rcx, %rax
> + ret
> +# else
> + /* NB: first_vec_x0 is 17 bytes, which leaves
> + cross_page_boundary (which is relatively cold) close enough
> + to ideal alignment. So only realign L(cross_page_boundary) if
> + rawmemchr. */
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original value is
> + necessary for computing the return address if a byte is found
> + or for adjusting the length if it is not (memchr only). */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> + for rawmemchr. */
> + andq $-VEC_SIZE, %ALGN_PTR_REG
> + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> + kmovd %k0, %r8d
> # ifdef USE_AS_WMEMCHR
> - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> + /* NB: Divide shift count by 4 since each bit in K0 represent 4
> bytes. */
> - movl %ecx, %SHIFT_REG
> - sarl $2, %SHIFT_REG
> + sarl $2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> + movl $(PAGE_SIZE / CHAR_SIZE), %esi
> + subl %eax, %esi
> # endif
> - andq $-VEC_SIZE, %rdi
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - /* Remove the leading bytes. */
> - sarxl %SHIFT_REG, %eax, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> # ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> + andl $(CHAR_PER_VEC - 1), %eax
> # endif
> + /* Remove the leading bytes. */
> + sarxl %eax, %r8d, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> + addq %RAW_PTR_REG, %rax
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> ret
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> -L(more_4x_vec):
> + .p2align 5
> +L(aligned_more):
> /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Align data to VEC_SIZE. */
> +L(cross_page_continue):
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + andq $-VEC_SIZE, %rdi
> + /* esi is for adjusting length to see if near the end. */
> + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %esi
> +# endif
> +# else
> + andq $-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially. */
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
> +
>
> # ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + /* Check if at last CHAR_PER_VEC * 4 length. */
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(last_4x_vec_or_less_cmpeq)
> + addq $VEC_SIZE, %rdi
>
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> + /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> + */
> +# ifdef USE_AS_WMEMCHR
> + movl %edi, %ecx
> andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + andl $(VEC_SIZE * 4 - 1), %ecx
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %ecx
> addq %rcx, %rdx
> +# else
> + addq %rdi, %rdx
> + andq $-(4 * VEC_SIZE), %rdi
> + subq %rdi, %rdx
> +# endif
> +# else
> + addq $VEC_SIZE, %rdi
> + andq $-(4 * VEC_SIZE), %rdi
> # endif
>
> + vpxorq %XZERO, %XZERO, %XZERO
> +
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> - kord %k1, %k2, %k5
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> - kord %k3, %k4, %k6
> - kortestd %k5, %k6
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + /* It would be possible to save some instructions using 4x VPCMP
> + but the bottleneck on port 5 makes it not worth it. */
> + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> + /* xor will set bytes that match esi to zero. */
> + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
> + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
> + VPCMP $0, %YMM3, %YZERO, %k2
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + kortestd %k2, %k3
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> + kortestd %k2, %k3
> + jnz L(loop_4x_vec_end)
> +
> + subq $-(VEC_SIZE * 4), %rdi
> +
> + subq $(CHAR_PER_VEC * 4), %rdx
> ja L(loop_4x_vec)
>
> + /* Fall through into the case where fewer than 4 vectors of
> + length remain. */
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + addq $(VEC_SIZE * 3), %rdi
> + .p2align 4
> L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> -
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x0)
> + jnz L(first_vec_x1_check)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> + /* If remaining length > CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jg L(last_4x_vec)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> +L(last_2x_vec):
> + /* If remaining length < CHAR_PER_VEC. */
> + addl $CHAR_PER_VEC, %edx
> + jle L(zero_end)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> + /* Check VEC2 and compare any match with remaining length. */
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> + ret
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
>
> - jnz L(first_vec_x3_check)
> + .p2align 4
> +L(first_vec_x1_check):
> + tzcntl %eax, %eax
> + /* Adjust length. */
> + subl $-(CHAR_PER_VEC * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +L(set_zero_end):
> xorl %eax, %eax
> ret
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if a match was found
> + in the loop. */
> +
> + /* k1 has the NOT of matches with VEC1. */
> kmovd %k1, %eax
> - testl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + subl $((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> + incl %eax
> +# endif
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> + VPCMP $0, %YMM2, %YZERO, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + kmovd %k2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - ret
> + jnz L(last_vec_x3_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> + kmovd %k3, %eax
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> -# endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> addq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(first_vec_x2_check):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# endif
> +# else
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> # else
> - addq %rdi, %rax
> + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> ret
>
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
> +
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jle L(last_2x_vec)
> +
> .p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Create mask for possible matches within remaining length. */
> +# ifdef USE_AS_WMEMCHR
> + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> + bzhil %edx, %ecx, %ecx
> +# else
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
> +# endif
> + /* Test matches in data against length match. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
> + remaining length was found to be > CHAR_PER_VEC * 2). */
> + subl $CHAR_PER_VEC, %edx
> + jbe L(zero_end2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Shift remaining length mask for last VEC. */
> +# ifdef USE_AS_WMEMCHR
> + shrl $CHAR_PER_VEC, %ecx
> +# else
> + shrq $CHAR_PER_VEC, %rcx
> +# endif
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
> ret
>
> - .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> .p2align 4
> -L(4x_vec_end):
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - kmovd %k2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - kmovd %k3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - kmovd %k4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> ret
> +# endif
>
> END (MEMCHR)
> #endif
> --
> 2.29.2
>
^ permalink raw reply [flat|nested] 20+ messages in thread
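On the "increasing ILP" point in the evex patch above, a per-lane C
model (scalar and purely illustrative; the function name is made up)
of how the new 4x loop folds matches in the first three vectors into
a single zero test instead of chaining kor's through port 5; the
fourth vector keeps its own mask k3 and is combined with kortest:

#include <stdbool.h>
#include <stdint.h>

static bool
lane_has_match (uint8_t v1, uint8_t v2, uint8_t v3, uint8_t c)
{
  bool k1 = (v1 != c);   /* VPCMP $4: not-equal mask.  */
  uint8_t x2 = v2 ^ c;   /* vpxorq: 0 iff v2 == c.  */
  uint8_t x3 = v3 ^ c;   /* vpxorq: 0 iff v3 == c.  */
  /* VPMINU %YMM2, %YMM3, %YMM3 {%k1}{z}: zero the lane outright
     when VEC1 matched, else take the unsigned min.  */
  uint8_t m = k1 ? (x2 < x3 ? x2 : x3) : 0;
  return m == 0;         /* VPCMP $0 against the zero vector.  */
}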
* Re: [PATCH v2 2/3] x86: Optimize memchr-avx2.S
2021-05-03 22:25 ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
@ 2021-05-03 22:58 ` Noah Goldstein
0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 6:25 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:06:54PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-avx2.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving a few instructions in the loop return path. test-memchr,
> > test-rawmemchr, and test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
> > 1 file changed, 247 insertions(+), 179 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 1fcb1c350f..8b862fb9d1 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -26,8 +26,22 @@
> >
> > # ifdef USE_AS_WMEMCHR
> > # define VPCMPEQ vpcmpeqd
> > +# define VPBROADCAST vpbroadcastd
> > +# define CHAR_SIZE 4
> > # else
> > # define VPCMPEQ vpcmpeqb
> > +# define VPBROADCAST vpbroadcastb
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# ifdef USE_AS_RAWMEMCHR
> > +# define ERAW_PTR_REG ecx
> > +# define RRAW_PTR_REG rcx
> > +# define ALGN_PTR_REG rdi
> > +# else
> > +# define ERAW_PTR_REG edi
> > +# define RRAW_PTR_REG rdi
> > +# define ALGN_PTR_REG rcx
> > # endif
> >
> > # ifndef VZEROUPPER
> > @@ -39,6 +53,7 @@
> > # endif
> >
> > # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> > .section SECTION(.text),"ax",@progbits
> > ENTRY (MEMCHR)
> > @@ -47,295 +62,348 @@ ENTRY (MEMCHR)
> > test %RDX_LP, %RDX_LP
> > jz L(null)
> > # endif
> > - movl %edi, %ecx
> > - /* Broadcast CHAR to YMM0. */
> > - vmovd %esi, %xmm0
> > # ifdef USE_AS_WMEMCHR
> > shl $2, %RDX_LP
> > - vpbroadcastd %xmm0, %ymm0
> > # else
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > # endif
> > - vpbroadcastb %xmm0, %ymm0
> > # endif
> > + /* Broadcast CHAR to YMMMATCH. */
> > + vmovd %esi, %xmm0
> > + VPBROADCAST %xmm0, %ymm0
> > /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + movl %edi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(cross_page_boundary)
> >
> > /* Check the first VEC_SIZE bytes. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > + VPCMPEQ (%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > # ifndef USE_AS_RAWMEMCHR
> > - jnz L(first_vec_x0_check)
> > - /* Adjust length and check the end of data. */
> > - subq $VEC_SIZE, %rdx
> > - jbe L(zero)
> > -# else
> > - jnz L(first_vec_x0)
> > + /* If length < CHAR_PER_VEC handle special. */
> > + cmpq $VEC_SIZE, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + testl %eax, %eax
> > + jz L(aligned_more)
> > + tzcntl %eax, %eax
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > - addq %rcx, %rdx
> > -
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > + .p2align 5
> > +L(first_vec_x0):
> > + /* Check if first match was before length. */
> > + tzcntl %eax, %eax
> > + xorl %ecx, %ecx
> > + cmpl %eax, %edx
> > + leaq (%rdi, %rax), %rax
> > + cmovle %rcx, %rax
> > + VZEROUPPER_RETURN
>
> Please add a blank line here to indicate this begins a new block.
Done.
> OK with this change. You should be able to push it yourself now.
Will do. Thanks!
>
> Thanks.
>
> > +L(null):
> > + xorl %eax, %eax
> > + ret
> > # endif
> > - jmp L(more_4x_vec)
> > -
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(cross_page_boundary):
> > + /* Save pointer before aligning as its original value is necessary
> > + for computing the return address if a byte is found or adjusting
> > + the length if it is not (memchr only). */
> > + movq %rdi, %rcx
> > + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> > + rdi for rawmemchr. */
> > + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> > + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Calculate length until end of page (length checked for a
> > + match). */
> > + leaq 1(%ALGN_PTR_REG), %rsi
> > + subq %RRAW_PTR_REG, %rsi
> > +# endif
> > /* Remove the leading bytes. */
> > - sarl %cl, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - tzcntl %eax, %eax
> > + sarxl %ERAW_PTR_REG, %eax, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + cmpq %rsi, %rdx
> > + jbe L(first_vec_x0)
> > # endif
> > - addq %rdi, %rax
> > - addq %rcx, %rax
> > + testl %eax, %eax
> > + jz L(cross_page_continue)
> > + tzcntl %eax, %eax
> > + addq %RRAW_PTR_REG, %rax
> > L(return_vzeroupper):
> > ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> > .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > - overflow. */
> > - negq %rcx
> > - addq $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > + tzcntl %eax, %eax
> > + incq %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - /* Check the end of data. */
> > - subq %rcx, %rdx
> > - jbe L(zero)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x2):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4
> > +L(first_vec_x3):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 2 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> >
> > -L(more_4x_vec):
> > + .p2align 4
> > +L(first_vec_x4):
> > + tzcntl %eax, %eax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > + addq %rdi, %rax
> > + VZEROUPPER_RETURN
> > +
> > + .p2align 4
> > +L(aligned_more):
> > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > since data is only aligned to VEC_SIZE. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(cross_page_continue):
> > + /* Align data to VEC_SIZE - 1. */
> > + xorl %ecx, %ecx
> > + subl %edi, %ecx
> > + orq $(VEC_SIZE - 1), %rdi
> > + /* esi is for adjusting length to see if near the end. */
> > + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> > +# else
> > + orq $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +# endif
> > + /* Load first VEC regardless. */
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Adjust length. If near end handle specially. */
> > + subq %rsi, %rdx
> > + jbe L(last_4x_vec_or_less)
> > +# endif
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - addq $(VEC_SIZE * 4), %rdi
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(first_vec_x4)
> >
> > # ifndef USE_AS_RAWMEMCHR
> > + /* Check if at last VEC_SIZE * 4 length. */
> > subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > -
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > - andq $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > + jbe L(last_4x_vec_or_less_cmpeq)
> > + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> > + length. */
> > + incq %rdi
> > + movl %edi, %ecx
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > + andl $(VEC_SIZE * 4 - 1), %ecx
> > addq %rcx, %rdx
> > +# else
> > + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> > + incq %rdi
> > + orq $(VEC_SIZE * 4 - 1), %rdi
> > # endif
> >
> > + /* Compare 4 * VEC at a time forward. */
> > .p2align 4
> > L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> > -
> > + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> > vpor %ymm1, %ymm2, %ymm5
> > vpor %ymm3, %ymm4, %ymm6
> > vpor %ymm5, %ymm6, %ymm5
> >
> > - vpmovmskb %ymm5, %eax
> > - testl %eax, %eax
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > + vpmovmskb %ymm5, %ecx
> > # ifdef USE_AS_RAWMEMCHR
> > - jmp L(loop_4x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> > + testl %ecx, %ecx
> > + jz L(loop_4x_vec)
> > # else
> > - subq $(VEC_SIZE * 4), %rdx
> > - ja L(loop_4x_vec)
> > + testl %ecx, %ecx
> > + jnz L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> > - addl $(VEC_SIZE * 2), %edx
> > - jle L(last_2x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> >
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + subq $(VEC_SIZE * 4), %rdx
> > + ja L(loop_4x_vec)
> >
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > + /* Fall through into less than 4 remaining vectors of length case.
> > + */
> > + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > + .p2align 4
> > +L(last_4x_vec_or_less):
> > + /* Check if first VEC contained match. */
> > testl %eax, %eax
> > - jnz L(first_vec_x1)
> > + jnz L(first_vec_x1_check)
> >
> > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > + /* If remaining length > VEC_SIZE * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jg L(last_4x_vec)
> >
> > - jnz L(first_vec_x2_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > +L(last_2x_vec):
> > + /* If remaining length < VEC_SIZE. */
> > + addl $VEC_SIZE, %edx
> > + jle L(zero_end)
> >
> > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > + /* Check VEC2 and compare any match with remaining length. */
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > -
> > - jnz L(first_vec_x3_check)
> > - xorl %eax, %eax
> > + tzcntl %eax, %eax
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + addq $(VEC_SIZE + 1), %rdi
> > + addq %rdi, %rax
> > +L(zero_end):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(last_2x_vec):
> > - addl $(VEC_SIZE * 2), %edx
> > - VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(loop_4x_vec_end):
> > +# endif
> > + /* rawmemchr will fall through into this if match was found in
> > + loop. */
> > +
> > vpmovmskb %ymm1, %eax
> > testl %eax, %eax
> > + jnz L(last_vec_x1_return)
> >
> > - jnz L(first_vec_x0_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > -
> > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > - vpmovmskb %ymm1, %eax
> > + vpmovmskb %ymm2, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x1_check)
> > - xorl %eax, %eax
> > - VZEROUPPER_RETURN
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0_check):
> > - tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + vpmovmskb %ymm3, %eax
> > + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
> > + salq $32, %rcx
> > + orq %rcx, %rax
> > + tzcntq %rax, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 2 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# ifndef USE_AS_RAWMEMCHR
> >
> > .p2align 4
> > L(first_vec_x1_check):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $VEC_SIZE, %rax
> > + /* Adjust length. */
> > + subl $-(VEC_SIZE * 4), %edx
> > + /* Check if match within remaining length. */
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + incq %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > + .p2align 4
> > +L(set_zero_end):
> > + xorl %eax, %eax
> > + VZEROUPPER_RETURN
> > +# endif
> >
> > .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_x1_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 2), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 4 - 1), %rdi
> > +# else
> > + incq %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> > tzcntl %eax, %eax
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 3), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + subq $(VEC_SIZE * 3 - 1), %rdi
> > +# else
> > + subq $-(VEC_SIZE + 1), %rdi
> > +# endif
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_RAWMEMCHR
> > .p2align 4
> > -L(zero):
> > - xorl %eax, %eax
> > - jmp L(return_vzeroupper)
> > +L(last_4x_vec_or_less_cmpeq):
> > + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + subq $-(VEC_SIZE * 4), %rdi
> > + /* Check first VEC regardless. */
> > + testl %eax, %eax
> > + jnz L(first_vec_x1_check)
> >
> > +	/* If remaining length <= VEC_SIZE * 2. */
> > + addl $(VEC_SIZE * 2), %edx
> > + jle L(last_2x_vec)
> > .p2align 4
> > -L(null):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> > +L(last_4x_vec):
> > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2_return)
> >
> > - .p2align 4
> > -L(first_vec_x0):
> > - tzcntl %eax, %eax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> >
> > - .p2align 4
> > -L(first_vec_x1):
> > - tzcntl %eax, %eax
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > - VZEROUPPER_RETURN
> > + /* Create mask for possible matches within remaining length. */
> > + movq $-1, %rcx
> > + bzhiq %rdx, %rcx, %rcx
> >
> > - .p2align 4
> > -L(first_vec_x2):
> > + /* Test matches in data against length match. */
> > + andl %ecx, %eax
> > + jnz L(last_vec_x3)
> > +
> > +	/* If remaining length <= VEC_SIZE * 3 (note this is after
> > +	   remaining length was found to be > VEC_SIZE * 2). */
> > + subl $VEC_SIZE, %edx
> > + jbe L(zero_end2)
> > +
> > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > + vpmovmskb %ymm1, %eax
> > + /* Shift remaining length mask for last VEC. */
> > + shrq $32, %rcx
> > + andl %ecx, %eax
> > + jz L(zero_end2)
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 2), %rax
> > + addq $(VEC_SIZE * 3 + 1), %rdi
> > addq %rdi, %rax
> > +L(zero_end2):
> > VZEROUPPER_RETURN
> >
> > .p2align 4
> > -L(4x_vec_end):
> > - vpmovmskb %ymm1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > - vpmovmskb %ymm2, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > - vpmovmskb %ymm3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > - vpmovmskb %ymm4, %eax
> > - testl %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> > tzcntl %eax, %eax
> > - addq $(VEC_SIZE * 3), %rax
> > + subq $-(VEC_SIZE * 2 + 1), %rdi
> > addq %rdi, %rax
> > VZEROUPPER_RETURN
> > +# endif
> >
> > END (MEMCHR)
> > #endif
> > --
> > 2.29.2
> >
^ permalink raw reply [flat|nested] 20+ messages in thread
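[Editor's gloss on the patch above: the "less strict" page cross logic boils down to one comparison at entry. Below is a minimal C model of the old and new checks, assuming 4096-byte pages and 32-byte vectors; the function names are illustrative, not from the patch.]

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Old check: take the slow path whenever the offset within a
       2 * VEC_SIZE window exceeds VEC_SIZE, even though many such
       loads stay on one page.  */
    static int old_may_cross_page (const void *p)
    {
      return ((uintptr_t) p & (2 * VEC_SIZE - 1)) > VEC_SIZE;
    }

    /* New check: take the slow path only if a VEC_SIZE load at p
       would actually run past the end of the current page.  */
    static int new_may_cross_page (const void *p)
    {
      return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
    }

[With the new test only the last VEC_SIZE - 1 byte offsets of each page fall into the cross-page path, instead of nearly half of all offsets.]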
* Re: [PATCH v2 3/3] x86: Optimize memchr-evex.S
2021-05-03 22:26 ` H.J. Lu
@ 2021-05-03 22:58 ` Noah Goldstein
0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Mon, May 3, 2021 at 6:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:06:55PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-evex.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving some ALU in the alignment process, and most importantly
> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> > test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
> > 1 file changed, 322 insertions(+), 225 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> > index 6dd5d67b90..147d7aa8ee 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> > @@ -26,14 +26,28 @@
> >
> > # ifdef USE_AS_WMEMCHR
> > # define VPBROADCAST vpbroadcastd
> > -# define VPCMP vpcmpd
> > -# define SHIFT_REG r8d
> > +# define VPMINU vpminud
> > +# define VPCMP vpcmpd
> > +# define VPCMPEQ vpcmpeqd
> > +# define CHAR_SIZE 4
> > # else
> > # define VPBROADCAST vpbroadcastb
> > -# define VPCMP vpcmpb
> > -# define SHIFT_REG ecx
> > +# define VPMINU vpminub
> > +# define VPCMP vpcmpb
> > +# define VPCMPEQ vpcmpeqb
> > +# define CHAR_SIZE 1
> > # endif
> >
> > +# ifdef USE_AS_RAWMEMCHR
> > +# define RAW_PTR_REG rcx
> > +# define ALGN_PTR_REG rdi
> > +# else
> > +# define RAW_PTR_REG rdi
> > +# define ALGN_PTR_REG rcx
> > +# endif
> > +
> > +# define XZERO xmm23
> > +# define YZERO ymm23
>
> Please rename XZERO/YZERO to XMMZERO/YMMZERO. OK with this change.
Done and thanks!
>
> Thanks.
>
> > # define XMMMATCH xmm16
> > # define YMMMATCH ymm16
> > # define YMM1 ymm17
> > @@ -44,6 +58,8 @@
> > # define YMM6 ymm22
> >
> > # define VEC_SIZE 32
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +# define PAGE_SIZE 4096
> >
> > .section .text.evex,"ax",@progbits
> > ENTRY (MEMCHR)
> > @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
> > /* Check for zero length. */
> > test %RDX_LP, %RDX_LP
> > jz L(zero)
> > -# endif
> > - movl %edi, %ecx
> > -# ifdef USE_AS_WMEMCHR
> > - shl $2, %RDX_LP
> > -# else
> > +
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > movl %edx, %edx
> > @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
> > /* Broadcast CHAR to YMMMATCH. */
> > VPBROADCAST %esi, %YMMMATCH
> > /* Check if we may cross page boundary with one vector load. */
> > - andl $(2 * VEC_SIZE - 1), %ecx
> > - cmpl $VEC_SIZE, %ecx
> > - ja L(cros_page_boundary)
> > + movl %edi, %eax
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(cross_page_boundary)
> >
> > /* Check the first VEC_SIZE bytes. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > -
> > + VPCMP $0, (%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > - jnz L(first_vec_x0_check)
> > - /* Adjust length and check the end of data. */
> > - subq $VEC_SIZE, %rdx
> > - jbe L(zero)
> > + /* If length < CHAR_PER_VEC handle special. */
> > + cmpq $CHAR_PER_VEC, %rdx
> > + jbe L(first_vec_x0)
> > +# endif
> > + testl %eax, %eax
> > + jz L(aligned_more)
> > + tzcntl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - jnz L(first_vec_x0)
> > + addq %rdi, %rax
> > # endif
> > -
> > - /* Align data for aligned loads in the loop. */
> > - addq $VEC_SIZE, %rdi
> > - andl $(VEC_SIZE - 1), %ecx
> > - andq $-VEC_SIZE, %rdi
> > + ret
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > - addq %rcx, %rdx
> > -
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > - jmp L(more_4x_vec)
> > +L(zero):
> > + xorl %eax, %eax
> > + ret
> >
> > + .p2align 5
> > +L(first_vec_x0):
> > + /* Check if first match was before length. */
> > + tzcntl %eax, %eax
> > + xorl %ecx, %ecx
> > + cmpl %eax, %edx
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > + cmovle %rcx, %rax
> > + ret
> > +# else
> > + /* NB: first_vec_x0 is 17 bytes which will leave
> > + cross_page_boundary (which is relatively cold) close enough
> > + to ideal alignment. So only realign L(cross_page_boundary) if
> > + rawmemchr. */
> > .p2align 4
> > -L(cros_page_boundary):
> > - andl $(VEC_SIZE - 1), %ecx
> > +# endif
> > +L(cross_page_boundary):
> > + /* Save pointer before aligning as its original value is
> > +	   necessary for computing the return address if byte is found or
> > + adjusting length if it is not and this is memchr. */
> > + movq %rdi, %rcx
> > + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> > + for rawmemchr. */
> > + andq $-VEC_SIZE, %ALGN_PTR_REG
> > + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> > + kmovd %k0, %r8d
> > # ifdef USE_AS_WMEMCHR
> > - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > + /* NB: Divide shift count by 4 since each bit in K0 represent 4
> > bytes. */
> > - movl %ecx, %SHIFT_REG
> > - sarl $2, %SHIFT_REG
> > + sarl $2, %eax
> > +# endif
> > +# ifndef USE_AS_RAWMEMCHR
> > + movl $(PAGE_SIZE / CHAR_SIZE), %esi
> > + subl %eax, %esi
> > # endif
> > - andq $-VEC_SIZE, %rdi
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - /* Remove the leading bytes. */
> > - sarxl %SHIFT_REG, %eax, %eax
> > - testl %eax, %eax
> > - jz L(aligned_more)
> > - tzcntl %eax, %eax
> > # ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > + andl $(CHAR_PER_VEC - 1), %eax
> > # endif
> > + /* Remove the leading bytes. */
> > + sarxl %eax, %r8d, %eax
> > # ifndef USE_AS_RAWMEMCHR
> > /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > + cmpq %rsi, %rdx
> > + jbe L(first_vec_x0)
> > +# endif
> > + testl %eax, %eax
> > + jz L(cross_page_continue)
> > + tzcntl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> > +# else
> > + addq %RAW_PTR_REG, %rax
> > # endif
> > - addq %rdi, %rax
> > - addq %rcx, %rax
> > ret
> >
> > .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > - overflow. */
> > - negq %rcx
> > - addq $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > + tzcntl %eax, %eax
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - /* Check the end of data. */
> > - subq %rcx, %rdx
> > - jbe L(zero)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x2):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > - addq $VEC_SIZE, %rdi
> > + .p2align 4
> > +L(first_vec_x3):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > + .p2align 4
> > +L(first_vec_x4):
> > + tzcntl %eax, %eax
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> >
> > -L(more_4x_vec):
> > + .p2align 5
> > +L(aligned_more):
> > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> > since data is only aligned to VEC_SIZE. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> >
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Align data to VEC_SIZE. */
> > +L(cross_page_continue):
> > + xorl %ecx, %ecx
> > + subl %edi, %ecx
> > + andq $-VEC_SIZE, %rdi
> > + /* esi is for adjusting length to see if near the end. */
> > + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> > + sarl $2, %esi
> > +# endif
> > +# else
> > + andq $-VEC_SIZE, %rdi
> > +L(cross_page_continue):
> > +# endif
> > + /* Load first VEC regardless. */
> > + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > + /* Adjust length. If near end handle specially. */
> > + subq %rsi, %rdx
> > + jbe L(last_4x_vec_or_less)
> > +# endif
> > testl %eax, %eax
> > jnz L(first_vec_x1)
> >
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x2)
> >
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > testl %eax, %eax
> > jnz L(first_vec_x3)
> >
> > - addq $(VEC_SIZE * 4), %rdi
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(first_vec_x4)
> > +
> >
> > # ifndef USE_AS_RAWMEMCHR
> > - subq $(VEC_SIZE * 4), %rdx
> > - jbe L(last_4x_vec_or_less)
> > -# endif
> > + /* Check if at last CHAR_PER_VEC * 4 length. */
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + jbe L(last_4x_vec_or_less_cmpeq)
> > + addq $VEC_SIZE, %rdi
> >
> > - /* Align data to 4 * VEC_SIZE. */
> > - movq %rdi, %rcx
> > - andl $(4 * VEC_SIZE - 1), %ecx
> > + /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> > + */
> > +# ifdef USE_AS_WMEMCHR
> > + movl %edi, %ecx
> > andq $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > - /* Adjust length. */
> > + andl $(VEC_SIZE * 4 - 1), %ecx
> > + /* NB: Divide bytes by 4 to get the wchar_t count. */
> > + sarl $2, %ecx
> > addq %rcx, %rdx
> > +# else
> > + addq %rdi, %rdx
> > + andq $-(4 * VEC_SIZE), %rdi
> > + subq %rdi, %rdx
> > +# endif
> > +# else
> > + addq $VEC_SIZE, %rdi
> > + andq $-(4 * VEC_SIZE), %rdi
> > # endif
> >
> > + vpxorq %XZERO, %XZERO, %XZERO
> > +
> > + /* Compare 4 * VEC at a time forward. */
> > .p2align 4
> > L(loop_4x_vec):
> > - /* Compare 4 * VEC at a time forward. */
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> > - kord %k1, %k2, %k5
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> > -
> > - kord %k3, %k4, %k6
> > - kortestd %k5, %k6
> > - jnz L(4x_vec_end)
> > -
> > - addq $(VEC_SIZE * 4), %rdi
> > -
> > + /* It would be possible to save some instructions using 4x VPCMP
> > +	   but the bottleneck on port 5 makes it not worth it. */
> > + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> > +	/* xor will set bytes that match esi to zero. */
> > + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> > + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> > + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> > + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
> > + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
> > + VPCMP $0, %YMM3, %YZERO, %k2
> > # ifdef USE_AS_RAWMEMCHR
> > - jmp L(loop_4x_vec)
> > + subq $-(VEC_SIZE * 4), %rdi
> > + kortestd %k2, %k3
> > + jz L(loop_4x_vec)
> > # else
> > - subq $(VEC_SIZE * 4), %rdx
> > + kortestd %k2, %k3
> > + jnz L(loop_4x_vec_end)
> > +
> > + subq $-(VEC_SIZE * 4), %rdi
> > +
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > ja L(loop_4x_vec)
> >
> > + /* Fall through into less than 4 remaining vectors of length case.
> > + */
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + addq $(VEC_SIZE * 3), %rdi
> > + .p2align 4
> > L(last_4x_vec_or_less):
> > - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> > - addl $(VEC_SIZE * 2), %edx
> > - jle L(last_2x_vec)
> > -
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + /* Check if first VEC contained match. */
> > testl %eax, %eax
> > - jnz L(first_vec_x0)
> > + jnz L(first_vec_x1_check)
> >
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > + /* If remaining length > CHAR_PER_VEC * 2. */
> > + addl $(CHAR_PER_VEC * 2), %edx
> > + jg L(last_4x_vec)
> >
> > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > +L(last_2x_vec):
> > + /* If remaining length < CHAR_PER_VEC. */
> > + addl $CHAR_PER_VEC, %edx
> > + jle L(zero_end)
> >
> > - jnz L(first_vec_x2_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > + /* Check VEC2 and compare any match with remaining length. */
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + tzcntl %eax, %eax
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end):
> > + ret
> >
> > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> >
> > - jnz L(first_vec_x3_check)
> > + .p2align 4
> > +L(first_vec_x1_check):
> > + tzcntl %eax, %eax
> > + /* Adjust length. */
> > + subl $-(CHAR_PER_VEC * 4), %edx
> > + /* Check if match within remaining length. */
> > + cmpl %eax, %edx
> > + jbe L(set_zero_end)
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > + ret
> > +L(set_zero_end):
> > xorl %eax, %eax
> > ret
> >
> > .p2align 4
> > -L(last_2x_vec):
> > - addl $(VEC_SIZE * 2), %edx
> > - VPCMP $0, (%rdi), %YMMMATCH, %k1
> > +L(loop_4x_vec_end):
> > +# endif
> > + /* rawmemchr will fall through into this if match was found in
> > + loop. */
> > +
> > + /* k1 has not of matches with VEC1. */
> > kmovd %k1, %eax
> > - testl %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > + subl $((1 << CHAR_PER_VEC) - 1), %eax
> > +# else
> > + incl %eax
> > +# endif
> > + jnz L(last_vec_x1_return)
> >
> > - jnz L(first_vec_x0_check)
> > - subl $VEC_SIZE, %edx
> > - jle L(zero)
> > + VPCMP $0, %YMM2, %YZERO, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2_return)
> >
> > - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > - kmovd %k1, %eax
> > + kmovd %k2, %eax
> > testl %eax, %eax
> > - jnz L(first_vec_x1_check)
> > - xorl %eax, %eax
> > - ret
> > + jnz L(last_vec_x3_return)
> >
> > - .p2align 4
> > -L(first_vec_x0_check):
> > + kmovd %k3, %eax
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_x1_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > -# endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $VEC_SIZE, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +# ifdef USE_AS_WMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > addq %rdi, %rax
> > - ret
> > -
> > - .p2align 4
> > -L(first_vec_x2_check):
> > - tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# endif
> > +# else
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - sall $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > - /* Check the end of data. */
> > - cmpq %rax, %rdx
> > - jbe L(zero)
> > - addq $(VEC_SIZE * 3), %rax
> > - addq %rdi, %rax
> > ret
> >
> > .p2align 4
> > -L(zero):
> > - xorl %eax, %eax
> > - ret
> > -# endif
> > -
> > - .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x3_return):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (%rdi, %rax, 4), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > # else
> > - addq %rdi, %rax
> > + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
> > + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> > # endif
> > ret
> >
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(last_4x_vec_or_less_cmpeq):
> > + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + subq $-(VEC_SIZE * 4), %rdi
> > + /* Check first VEC regardless. */
> > + testl %eax, %eax
> > + jnz L(first_vec_x1_check)
> > +
> > + /* If remaining length <= CHAR_PER_VEC * 2. */
> > + addl $(CHAR_PER_VEC * 2), %edx
> > + jle L(last_2x_vec)
> > +
> > .p2align 4
> > -L(first_vec_x1):
> > +L(last_4x_vec):
> > + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + testl %eax, %eax
> > + jnz L(last_vec_x2)
> > +
> > +
> > + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + /* Create mask for possible matches within remaining length. */
> > +# ifdef USE_AS_WMEMCHR
> > + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> > + bzhil %edx, %ecx, %ecx
> > +# else
> > + movq $-1, %rcx
> > + bzhiq %rdx, %rcx, %rcx
> > +# endif
> > + /* Test matches in data against length match. */
> > + andl %ecx, %eax
> > + jnz L(last_vec_x3)
> > +
> > +	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
> > +	   remaining length was found to be > CHAR_PER_VEC * 2). */
> > + subl $CHAR_PER_VEC, %edx
> > + jbe L(zero_end2)
> > +
> > +
> > + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > + kmovd %k0, %eax
> > + /* Shift remaining length mask for last VEC. */
> > +# ifdef USE_AS_WMEMCHR
> > + shrl $CHAR_PER_VEC, %ecx
> > +# else
> > + shrq $CHAR_PER_VEC, %rcx
> > +# endif
> > + andl %ecx, %eax
> > + jz L(zero_end2)
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > - addq $VEC_SIZE, %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end2):
> > ret
> >
> > - .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x2):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> > -# else
> > - addq $(VEC_SIZE * 2), %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> >
> > .p2align 4
> > -L(4x_vec_end):
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x0)
> > - kmovd %k2, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x1)
> > - kmovd %k3, %eax
> > - testl %eax, %eax
> > - jnz L(first_vec_x2)
> > - kmovd %k4, %eax
> > - testl %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> > tzcntl %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> > - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> > -# else
> > - addq $(VEC_SIZE * 3), %rax
> > - addq %rdi, %rax
> > -# endif
> > + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > ret
> > +# endif
> >
> > END (MEMCHR)
> > #endif
> > --
> > 2.29.2
> >
^ permalink raw reply [flat|nested] 20+ messages in thread
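[Editor's gloss: one recurring idiom in the L(cross_page_boundary) code above is that the whole aligned vector is compared, and the match-mask bits belonging to bytes before the real start are then shifted out with sarx. A rough C model, assuming a byte-granularity 32-bit mask as in plain memchr; the helper name is made up for illustration.]

    #include <stdint.h>

    #define VEC_SIZE 32

    /* MASK has bit i set if byte i of the aligned VEC_SIZE load
       matched.  SRC is the original, possibly unaligned pointer.
       After the shift, bit 0 corresponds to SRC itself, as with
       "sarxl %ERAW_PTR_REG, %eax, %eax" (the shift count register
       is read modulo 32 for a 32-bit operand).  */
    static uint32_t drop_leading_bytes (uint32_t mask, const void *src)
    {
      unsigned int misalign = (uintptr_t) src & (VEC_SIZE - 1);
      return mask >> misalign;
    }

[The assembly uses an arithmetic shift, which can smear a match in the top bit into the high bits; that is harmless here because the result is consumed by tzcnt, which reports only the lowest set bit.]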
* [PATCH v3 2/3] x86: Optimize memchr-avx2.S
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 18:50 ` H.J. Lu
2021-05-03 20:06 ` [PATCH v2 " Noah Goldstein
@ 2021-05-03 22:58 ` Noah Goldstein
2021-05-03 22:58 ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:59 ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2 siblings, 2 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
and saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
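
[Editor's gloss: the cmovcc change is easiest to see in L(first_vec_x0) below. Instead of branching on whether the first match lies within the length, the out-of-bounds case is folded into a conditional move. A C sketch of the pattern, assuming idx came from tzcnt on the match mask; this models the idea, not the exact patch.]

    #include <stddef.h>

    /* Mirrors "cmpl %eax, %edx; cmovle %rcx, %rax": keep src + idx
       when the match index is below the length, otherwise return
       NULL via a conditional move rather than a branch.  */
    static const char *select_match (const char *src, int idx, int len)
    {
      const char *hit = src + idx;
      return len <= idx ? NULL : hit;
    }

[Because tzcnt of an all-zero mask yields 32 and this path is only taken when len <= VEC_SIZE, the no-match case falls out of the same comparison.]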
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
1 file changed, 247 insertions(+), 178 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..0d8758e3e7 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+	/* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+	   for computing the return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into the case of fewer than 4 vectors of
+	   length remaining.  */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+	/* If remaining length <= VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+	/* If remaining length <= VEC_SIZE * 3 (note this is after
+	   remaining length was found to be > VEC_SIZE * 2). */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
--
2.29.2
^ permalink raw reply [flat|nested] 20+ messages in thread
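[Editor's gloss: the length masking in L(last_4x_vec) above, "movq $-1, %rcx; bzhiq %rdx, %rcx, %rcx", has a compact C equivalent. A sketch, assuming the earlier adjustments have left the remaining length biased so that bit 0 of the mask lines up with the first byte of VEC3; helper names are illustrative.]

    #include <stdint.h>

    /* bzhiq: keep bits [0, len) of an all-ones source; for len >= 64
       the instruction keeps every bit (and sets CF).  */
    static uint64_t length_mask (uint64_t len)
    {
      return len >= 64 ? ~0ULL : (1ULL << len) - 1;
    }

    /* VEC3's 32 match bits are tested against the low half of the
       mask; for VEC4 the mask is shifted down first, as with
       "shrq $32, %rcx".  */
    static int match_in_vec3 (uint32_t vec3, uint64_t len)
    {
      return (vec3 & (uint32_t) length_mask (len)) != 0;
    }

    static int match_in_vec4 (uint32_t vec4, uint64_t len)
    {
      return (vec4 & (uint32_t) (length_mask (len) >> 32)) != 0;
    }

[A single 64-bit mask thus bounds two vectors of matches without an extra compare-and-branch on the length.]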
* [PATCH v3 3/3] x86: Optimize memchr-evex.S
2021-05-03 22:58 ` [PATCH v3 " Noah Goldstein
@ 2021-05-03 22:58 ` Noah Goldstein
2021-05-03 22:59 ` Noah Goldstein
2021-05-03 22:59 ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
1 sibling, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
To: libc-alpha
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
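
[Editor's gloss: the ILP point is the heart of the loop change. Rather than four equality compares that all queue on port 5, matches in the middle two vectors are turned into zero bytes with vpxorq, folded together with a zero-masked vpminu (the mask carries the first vector's matches), and the result is tested against zero. Below is a scalar C model of one iteration, assuming byte granularity as in plain memchr; names are illustrative.]

    #include <stdint.h>

    #define VEC_SIZE 32

    /* Returns nonzero if C occurs anywhere in the four consecutive
       VEC_SIZE blocks at P, mimicking the k2/k3 reduction.  */
    static int any_match_4x (const uint8_t *p, uint8_t c)
    {
      int k2 = 0, k3 = 0;
      for (int i = 0; i < VEC_SIZE; i++)
        {
          /* VPCMP $4 (not-equal): k1 lane is 1 where block 0 does
             NOT match.  */
          int k1 = p[i] != c;
          /* vpxorq: matching bytes become zero.  */
          uint8_t x2 = p[VEC_SIZE + i] ^ c;
          uint8_t x3 = p[2 * VEC_SIZE + i] ^ c;
          /* VPMINU ... {%k1}{z}: a block-0 match forces a zero lane;
             otherwise the unsigned min keeps any zero from blocks
             1 or 2.  */
          uint8_t m = k1 ? (x2 < x3 ? x2 : x3) : 0;
          /* VPCMP $0 against the zero vector.  */
          k2 |= (m == 0);
          /* Block 3 keeps its own direct equality mask (k3).  */
          k3 |= (p[3 * VEC_SIZE + i] == c);
        }
      /* kortestd %k2, %k3 */
      return k2 | k3;
    }

[The xors and the min can execute on ports other than 5, so the four loads' results are reduced with more instruction-level parallelism than four VPCMPs would allow.]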
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..81d5cd6486 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+	   necessary for computing the return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but bottleneck on port 5 makes it not woth it. */
+	   but the bottleneck on port 5 makes it not worth it. */
+	/* xor will set bytes that match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
+	/* Fall through into the case of fewer than 4 vectors of
+	   length remaining.  */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has not of matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against the length mask. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2). */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
END (MEMCHR)
#endif
--
2.29.2
^ permalink raw reply [flat|nested] 20+ messages in thread
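
A rough C model of the length-check returns above (L(first_vec_x1_check)
and L(last_2x_vec)) may help; the helper below is only an illustrative
sketch for the byte variant with 32-byte vectors, not glibc code. The
subtlety it captures is that tzcnt of a zero mask yields the operand
size (32), so a vector with no match falls out through the same length
compare instead of needing its own branch.

  #include <stddef.h>
  #include <stdint.h>

  /* 'mask' has one compare bit per byte of a 32-byte vector; 'remaining'
     is the number of valid bytes left.  Mirrors the tzcntl; cmpl; jbe
     L(set_zero_end) sequence.  */
  static inline const char *
  check_vec (const char *base, uint32_t mask, uint32_t remaining)
  {
    /* tzcnt is defined to return 32 for a zero input; __builtin_ctz is
       not, so the zero case is spelled out.  */
    uint32_t off = mask ? (uint32_t) __builtin_ctz (mask) : 32;
    if (remaining <= off)
      return NULL;              /* Match (if any) is past the end.  */
    return base + off;
  }
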
* Re: [PATCH v3 2/3] x86: Optimize memchr-avx2.S
2021-05-03 22:58 ` [PATCH v3 " Noah Goldstein
2021-05-03 22:58 ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:59 ` Noah Goldstein
1 sibling, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:59 UTC (permalink / raw)
To: GNU C Library
On Mon, May 3, 2021 at 6:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> and saving a few instructions in the loop return path. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
> 1 file changed, 247 insertions(+), 178 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..0d8758e3e7 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPCMPEQ vpcmpeqd
> +# define VPBROADCAST vpbroadcastd
> +# define CHAR_SIZE 4
> # else
> # define VPCMPEQ vpcmpeqb
> +# define VPBROADCAST vpbroadcastb
> +# define CHAR_SIZE 1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +# define ERAW_PTR_REG ecx
> +# define RRAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define ERAW_PTR_REG edi
> +# define RRAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> # endif
>
> # ifndef VZEROUPPER
> @@ -39,6 +53,7 @@
> # endif
>
> # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
> .section SECTION(.text),"ax",@progbits
> ENTRY (MEMCHR)
> @@ -47,295 +62,349 @@ ENTRY (MEMCHR)
> test %RDX_LP, %RDX_LP
> jz L(null)
> # endif
> - movl %edi, %ecx
> - /* Broadcast CHAR to YMM0. */
> - vmovd %esi, %xmm0
> # ifdef USE_AS_WMEMCHR
> shl $2, %RDX_LP
> - vpbroadcastd %xmm0, %ymm0
> # else
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> # endif
> - vpbroadcastb %xmm0, %ymm0
> # endif
> + /* Broadcast CHAR to YMMMATCH. */
> + vmovd %esi, %xmm0
> + VPBROADCAST %xmm0, %ymm0
> /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> + VPCMPEQ (%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> -# else
> - jnz L(first_vec_x0)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $VEC_SIZE, %rdx
> + jbe L(first_vec_x0)
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax), %rax
> + cmovle %rcx, %rax
> + VZEROUPPER_RETURN
>
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> +L(null):
> + xorl %eax, %eax
> + ret
> # endif
> - jmp L(more_4x_vec)
> -
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original value is necessary
> + for computing the return address if the byte is found or adjusting
> + the length if it is not and this is memchr. */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> + rdi for rawmemchr. */
> + orq $(VEC_SIZE - 1), %ALGN_PTR_REG
> + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Calculate length until end of page (length checked for a
> + match). */
> + leaq 1(%ALGN_PTR_REG), %rsi
> + subq %RRAW_PTR_REG, %rsi
> +# endif
> /* Remove the leading bytes. */
> - sarl %cl, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> + sarxl %ERAW_PTR_REG, %eax, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> + addq %RRAW_PTR_REG, %rax
> L(return_vzeroupper):
> ZERO_UPPER_VEC_REGISTERS_RETURN
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + incq %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
> +
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 2 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> - addq $VEC_SIZE, %rdi
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> + addq %rdi, %rax
> + VZEROUPPER_RETURN
>
> -L(more_4x_vec):
> + .p2align 4
> +L(aligned_more):
> /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> + /* Align data to VEC_SIZE - 1. */
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + orq $(VEC_SIZE - 1), %rdi
> + /* esi is for adjusting length to see if near the end. */
> + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> + orq $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially. */
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
>
> # ifndef USE_AS_RAWMEMCHR
> + /* Check if at last VEC_SIZE * 4 length. */
> subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> -
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> - andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + jbe L(last_4x_vec_or_less_cmpeq)
> + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> + length. */
> + incq %rdi
> + movl %edi, %ecx
> + orq $(VEC_SIZE * 4 - 1), %rdi
> + andl $(VEC_SIZE * 4 - 1), %ecx
> addq %rcx, %rdx
> +# else
> + /* Align data to VEC_SIZE * 4 - 1 for loop. */
> + incq %rdi
> + orq $(VEC_SIZE * 4 - 1), %rdi
> # endif
>
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> + VPCMPEQ 1(%rdi), %ymm0, %ymm1
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> vpor %ymm1, %ymm2, %ymm5
> vpor %ymm3, %ymm4, %ymm6
> vpor %ymm5, %ymm6, %ymm5
>
> - vpmovmskb %ymm5, %eax
> - testl %eax, %eax
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + vpmovmskb %ymm5, %ecx
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + testl %ecx, %ecx
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(loop_4x_vec)
> + testl %ecx, %ecx
> + jnz L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
>
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(loop_4x_vec)
>
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> + /* Fall through into the case with fewer than 4 vectors of length
> + remaining. */
> + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> + .p2align 4
> +L(last_4x_vec_or_less):
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x1)
> + jnz L(first_vec_x1_check)
>
> - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> + /* If remaining length > VEC_SIZE * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jg L(last_4x_vec)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> +L(last_2x_vec):
> + /* If remaining length < VEC_SIZE. */
> + addl $VEC_SIZE, %edx
> + jle L(zero_end)
>
> - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> + /* Check VEC2 and compare any match with remaining length. */
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> -
> - jnz L(first_vec_x3_check)
> - xorl %eax, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + addq $(VEC_SIZE + 1), %rdi
> + addq %rdi, %rax
> +L(zero_end):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if match was found in
> + loop. */
> +
> vpmovmskb %ymm1, %eax
> testl %eax, %eax
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> -
> - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> - vpmovmskb %ymm1, %eax
> + vpmovmskb %ymm2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - VZEROUPPER_RETURN
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> - tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + vpmovmskb %ymm3, %eax
> + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
> + salq $32, %rcx
> + orq %rcx, %rax
> + tzcntq %rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 2 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>
> .p2align 4
> L(first_vec_x1_check):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> + /* Adjust length. */
> + subl $-(VEC_SIZE * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + incq %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> + .p2align 4
> +L(set_zero_end):
> + xorl %eax, %eax
> + VZEROUPPER_RETURN
> +# endif
>
> .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 4 - 1), %rdi
> +# else
> + incq %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + subq $(VEC_SIZE * 3 - 1), %rdi
> +# else
> + subq $-(VEC_SIZE + 1), %rdi
> +# endif
> addq %rdi, %rax
> VZEROUPPER_RETURN
>
> +# ifndef USE_AS_RAWMEMCHR
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - jmp L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
>
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(VEC_SIZE * 2), %edx
> + jle L(last_2x_vec)
> .p2align 4
> -L(null):
> - xorl %eax, %eax
> - ret
> -# endif
> +L(last_4x_vec):
> + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - .p2align 4
> -L(first_vec_x0):
> - tzcntl %eax, %eax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
>
> - .p2align 4
> -L(first_vec_x1):
> - tzcntl %eax, %eax
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> - VZEROUPPER_RETURN
> + /* Create mask for possible matches within remaining length. */
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
>
> - .p2align 4
> -L(first_vec_x2):
> + /* Test matches in data against the length mask. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* If remaining length <= VEC_SIZE * 3 (note this is after
> + remaining length was found to be > VEC_SIZE * 2). */
> + subl $VEC_SIZE, %edx
> + jbe L(zero_end2)
> +
> + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> + vpmovmskb %ymm1, %eax
> + /* Shift remaining length mask for last VEC. */
> + shrq $32, %rcx
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 2), %rax
> + addq $(VEC_SIZE * 3 + 1), %rdi
> addq %rdi, %rax
> +L(zero_end2):
> VZEROUPPER_RETURN
>
> .p2align 4
> -L(4x_vec_end):
> - vpmovmskb %ymm1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - vpmovmskb %ymm2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - vpmovmskb %ymm3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - vpmovmskb %ymm4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> - addq $(VEC_SIZE * 3), %rax
> + subq $-(VEC_SIZE * 2 + 1), %rdi
> addq %rdi, %rax
> VZEROUPPER_RETURN
> +# endif
>
> END (MEMCHR)
> #endif
> --
> 2.29.2
>
This is the patch I will push unless anyone has objections.
^ permalink raw reply [flat|nested] 20+ messages in thread
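
On the "page cross logic less strict" point in the commit message: the
old prologue flagged any pointer whose offset within a 2 * VEC_SIZE
block exceeded VEC_SIZE (roughly half of all inputs), while the new
test only flags pointers lying in the last VEC_SIZE - 1 bytes of a
page. A hedged C sketch of the new test (illustrative only, with
PAGE_SIZE and VEC_SIZE as defined in the patch):

  #include <stdint.h>

  #define PAGE_SIZE 4096
  #define VEC_SIZE 32

  /* A full VEC_SIZE load from p can only fault if it extends past the
     end of p's page, i.e. if the in-page offset is above
     PAGE_SIZE - VEC_SIZE.  */
  static inline int
  load_may_cross_page (const void *p)
  {
    return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
  }
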
* Re: [PATCH v3 3/3] x86: Optimize memchr-evex.S
2021-05-03 22:58 ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:59 ` Noah Goldstein
0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:59 UTC (permalink / raw)
To: GNU C Library
On Mon, May 3, 2021 at 6:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
> 1 file changed, 322 insertions(+), 225 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..81d5cd6486 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>
> # ifdef USE_AS_WMEMCHR
> # define VPBROADCAST vpbroadcastd
> -# define VPCMP vpcmpd
> -# define SHIFT_REG r8d
> +# define VPMINU vpminud
> +# define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> +# define CHAR_SIZE 4
> # else
> # define VPBROADCAST vpbroadcastb
> -# define VPCMP vpcmpb
> -# define SHIFT_REG ecx
> +# define VPMINU vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ vpcmpeqb
> +# define CHAR_SIZE 1
> # endif
>
> +# ifdef USE_AS_RAWMEMCHR
> +# define RAW_PTR_REG rcx
> +# define ALGN_PTR_REG rdi
> +# else
> +# define RAW_PTR_REG rdi
> +# define ALGN_PTR_REG rcx
> +# endif
> +
> +# define XMMZERO xmm23
> +# define YMMZERO ymm23
> # define XMMMATCH xmm16
> # define YMMMATCH ymm16
> # define YMM1 ymm17
> @@ -44,6 +58,8 @@
> # define YMM6 ymm22
>
> # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>
> .section .text.evex,"ax",@progbits
> ENTRY (MEMCHR)
> @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
> /* Check for zero length. */
> test %RDX_LP, %RDX_LP
> jz L(zero)
> -# endif
> - movl %edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> - shl $2, %RDX_LP
> -# else
> +
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> movl %edx, %edx
> @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
> /* Broadcast CHAR to YMMMATCH. */
> VPBROADCAST %esi, %YMMMATCH
> /* Check if we may cross page boundary with one vector load. */
> - andl $(2 * VEC_SIZE - 1), %ecx
> - cmpl $VEC_SIZE, %ecx
> - ja L(cros_page_boundary)
> + movl %edi, %eax
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page_boundary)
>
> /* Check the first VEC_SIZE bytes. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> -
> + VPCMP $0, (%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> # ifndef USE_AS_RAWMEMCHR
> - jnz L(first_vec_x0_check)
> - /* Adjust length and check the end of data. */
> - subq $VEC_SIZE, %rdx
> - jbe L(zero)
> + /* If length < CHAR_PER_VEC handle special. */
> + cmpq $CHAR_PER_VEC, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(aligned_more)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> # else
> - jnz L(first_vec_x0)
> + addq %rdi, %rax
> # endif
> -
> - /* Align data for aligned loads in the loop. */
> - addq $VEC_SIZE, %rdi
> - andl $(VEC_SIZE - 1), %ecx
> - andq $-VEC_SIZE, %rdi
> + ret
>
> # ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> - addq %rcx, %rdx
> -
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> - jmp L(more_4x_vec)
> +L(zero):
> + xorl %eax, %eax
> + ret
>
> + .p2align 5
> +L(first_vec_x0):
> + /* Check if first match was before length. */
> + tzcntl %eax, %eax
> + xorl %ecx, %ecx
> + cmpl %eax, %edx
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> + cmovle %rcx, %rax
> + ret
> +# else
> + /* NB: first_vec_x0 is 17 bytes which will leave
> + cross_page_boundary (which is relatively cold) close enough
> + to ideal alignment. So only realign L(cross_page_boundary) if
> + rawmemchr. */
> .p2align 4
> -L(cros_page_boundary):
> - andl $(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> + /* Save pointer before aligning as its original value is
> + necessary for computing the return address if the byte is found
> + or for adjusting the length if it is not and this is memchr. */
> + movq %rdi, %rcx
> + /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> + for rawmemchr. */
> + andq $-VEC_SIZE, %ALGN_PTR_REG
> + VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> + kmovd %k0, %r8d
> # ifdef USE_AS_WMEMCHR
> - /* NB: Divide shift count by 4 since each bit in K1 represent 4
> + /* NB: Divide shift count by 4 since each bit in K0 represents 4
> bytes. */
> - movl %ecx, %SHIFT_REG
> - sarl $2, %SHIFT_REG
> + sarl $2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> + movl $(PAGE_SIZE / CHAR_SIZE), %esi
> + subl %eax, %esi
> # endif
> - andq $-VEC_SIZE, %rdi
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - /* Remove the leading bytes. */
> - sarxl %SHIFT_REG, %eax, %eax
> - testl %eax, %eax
> - jz L(aligned_more)
> - tzcntl %eax, %eax
> # ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> + andl $(CHAR_PER_VEC - 1), %eax
> # endif
> + /* Remove the leading bytes. */
> + sarxl %eax, %r8d, %eax
> # ifndef USE_AS_RAWMEMCHR
> /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> + cmpq %rsi, %rdx
> + jbe L(first_vec_x0)
> +# endif
> + testl %eax, %eax
> + jz L(cross_page_continue)
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> + addq %RAW_PTR_REG, %rax
> # endif
> - addq %rdi, %rax
> - addq %rcx, %rax
> ret
>
> .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> - overflow. */
> - negq %rcx
> - addq $VEC_SIZE, %rcx
> +L(first_vec_x1):
> + tzcntl %eax, %eax
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - /* Check the end of data. */
> - subq %rcx, %rdx
> - jbe L(zero)
> -# endif
> + .p2align 4
> +L(first_vec_x2):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> - addq $VEC_SIZE, %rdi
> + .p2align 4
> +L(first_vec_x3):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> -# ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + .p2align 4
> +L(first_vec_x4):
> + tzcntl %eax, %eax
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> + ret
>
> -L(more_4x_vec):
> + .p2align 5
> +L(aligned_more):
> /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
> since data is only aligned to VEC_SIZE. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Align data to VEC_SIZE. */
> +L(cross_page_continue):
> + xorl %ecx, %ecx
> + subl %edi, %ecx
> + andq $-VEC_SIZE, %rdi
> + /* esi is for adjusting length to see if near the end. */
> + leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %esi
> +# endif
> +# else
> + andq $-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> + /* Load first VEC regardless. */
> + VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> + /* Adjust length. If near end handle specially. */
> + subq %rsi, %rdx
> + jbe L(last_4x_vec_or_less)
> +# endif
> testl %eax, %eax
> jnz L(first_vec_x1)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x2)
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> testl %eax, %eax
> jnz L(first_vec_x3)
>
> - addq $(VEC_SIZE * 4), %rdi
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(first_vec_x4)
> +
>
> # ifndef USE_AS_RAWMEMCHR
> - subq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec_or_less)
> -# endif
> + /* Check if at last CHAR_PER_VEC * 4 length. */
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(last_4x_vec_or_less_cmpeq)
> + addq $VEC_SIZE, %rdi
>
> - /* Align data to 4 * VEC_SIZE. */
> - movq %rdi, %rcx
> - andl $(4 * VEC_SIZE - 1), %ecx
> + /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> + */
> +# ifdef USE_AS_WMEMCHR
> + movl %edi, %ecx
> andq $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> - /* Adjust length. */
> + andl $(VEC_SIZE * 4 - 1), %ecx
> + /* NB: Divide bytes by 4 to get the wchar_t count. */
> + sarl $2, %ecx
> addq %rcx, %rdx
> +# else
> + addq %rdi, %rdx
> + andq $-(4 * VEC_SIZE), %rdi
> + subq %rdi, %rdx
> +# endif
> +# else
> + addq $VEC_SIZE, %rdi
> + andq $-(4 * VEC_SIZE), %rdi
> # endif
>
> + vpxorq %XMMZERO, %XMMZERO, %XMMZERO
> +
> + /* Compare 4 * VEC at a time forward. */
> .p2align 4
> L(loop_4x_vec):
> - /* Compare 4 * VEC at a time forward. */
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> - kord %k1, %k2, %k5
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> - kord %k3, %k4, %k6
> - kortestd %k5, %k6
> - jnz L(4x_vec_end)
> -
> - addq $(VEC_SIZE * 4), %rdi
> -
> + /* It would be possible to save some instructions using 4x VPCMP
> + but the bottleneck on port 5 makes it not worth it. */
> + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> + /* xor will set bytes that match esi to zero. */
> + vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
> + VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
> + VPCMP $0, %YMM3, %YMMZERO, %k2
> # ifdef USE_AS_RAWMEMCHR
> - jmp L(loop_4x_vec)
> + subq $-(VEC_SIZE * 4), %rdi
> + kortestd %k2, %k3
> + jz L(loop_4x_vec)
> # else
> - subq $(VEC_SIZE * 4), %rdx
> + kortestd %k2, %k3
> + jnz L(loop_4x_vec_end)
> +
> + subq $-(VEC_SIZE * 4), %rdi
> +
> + subq $(CHAR_PER_VEC * 4), %rdx
> ja L(loop_4x_vec)
>
> + /* Fall through into the case with fewer than 4 vectors of length
> + remaining. */
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + addq $(VEC_SIZE * 3), %rdi
> + .p2align 4
> L(last_4x_vec_or_less):
> - /* Less than 4 * VEC and aligned to VEC_SIZE. */
> - addl $(VEC_SIZE * 2), %edx
> - jle L(last_2x_vec)
> -
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + /* Check if first VEC contained match. */
> testl %eax, %eax
> - jnz L(first_vec_x0)
> + jnz L(first_vec_x1_check)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> + /* If remaining length > CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jg L(last_4x_vec)
>
> - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> +L(last_2x_vec):
> + /* If remaining length < CHAR_PER_VEC. */
> + addl $CHAR_PER_VEC, %edx
> + jle L(zero_end)
>
> - jnz L(first_vec_x2_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> + /* Check VEC2 and compare any match with remaining length. */
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + tzcntl %eax, %eax
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> + ret
>
> - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
>
> - jnz L(first_vec_x3_check)
> + .p2align 4
> +L(first_vec_x1_check):
> + tzcntl %eax, %eax
> + /* Adjust length. */
> + subl $-(CHAR_PER_VEC * 4), %edx
> + /* Check if match within remaining length. */
> + cmpl %eax, %edx
> + jbe L(set_zero_end)
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> + ret
> +L(set_zero_end):
> xorl %eax, %eax
> ret
>
> .p2align 4
> -L(last_2x_vec):
> - addl $(VEC_SIZE * 2), %edx
> - VPCMP $0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> + /* rawmemchr will fall through into this if match was found in
> + loop. */
> +
> + /* k1 has the NOT of the matches with VEC1. */
> kmovd %k1, %eax
> - testl %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> + subl $((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> + incl %eax
> +# endif
> + jnz L(last_vec_x1_return)
>
> - jnz L(first_vec_x0_check)
> - subl $VEC_SIZE, %edx
> - jle L(zero)
> + VPCMP $0, %YMM2, %YMMZERO, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2_return)
>
> - VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> - kmovd %k1, %eax
> + kmovd %k2, %eax
> testl %eax, %eax
> - jnz L(first_vec_x1_check)
> - xorl %eax, %eax
> - ret
> + jnz L(last_vec_x3_return)
>
> - .p2align 4
> -L(first_vec_x0_check):
> + kmovd %k3, %eax
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> -# endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +# ifdef USE_AS_WMEMCHR
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> addq %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(first_vec_x2_check):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# endif
> +# else
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - sall $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> - /* Check the end of data. */
> - cmpq %rax, %rdx
> - jbe L(zero)
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> ret
>
> .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - ret
> -# endif
> -
> - .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> # else
> - addq %rdi, %rax
> + /* NB: Multiply the char count by CHAR_SIZE to get the byte offset. */
> + leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> # endif
> ret
>
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + subq $-(VEC_SIZE * 4), %rdi
> + /* Check first VEC regardless. */
> + testl %eax, %eax
> + jnz L(first_vec_x1_check)
> +
> + /* If remaining length <= CHAR_PER_VEC * 2. */
> + addl $(CHAR_PER_VEC * 2), %edx
> + jle L(last_2x_vec)
> +
> .p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + testl %eax, %eax
> + jnz L(last_vec_x2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Create mask for possible matches within remaining length. */
> +# ifdef USE_AS_WMEMCHR
> + movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> + bzhil %edx, %ecx, %ecx
> +# else
> + movq $-1, %rcx
> + bzhiq %rdx, %rcx, %rcx
> +# endif
> + /* Test matches in data against the length mask. */
> + andl %ecx, %eax
> + jnz L(last_vec_x3)
> +
> + /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
> + remaining length was found to be > CHAR_PER_VEC * 2). */
> + subl $CHAR_PER_VEC, %edx
> + jbe L(zero_end2)
> +
> +
> + VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> + kmovd %k0, %eax
> + /* Shift remaining length mask for last VEC. */
> +# ifdef USE_AS_WMEMCHR
> + shrl $CHAR_PER_VEC, %ecx
> +# else
> + shrq $CHAR_PER_VEC, %rcx
> +# endif
> + andl %ecx, %eax
> + jz L(zero_end2)
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> - addq $VEC_SIZE, %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
> ret
>
> - .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 2), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> ret
>
> .p2align 4
> -L(4x_vec_end):
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x0)
> - kmovd %k2, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x1)
> - kmovd %k3, %eax
> - testl %eax, %eax
> - jnz L(first_vec_x2)
> - kmovd %k4, %eax
> - testl %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
> tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> - /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
> - leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> - addq $(VEC_SIZE * 3), %rax
> - addq %rdi, %rax
> -# endif
> + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> ret
> +# endif
>
> END (MEMCHR)
> #endif
> --
> 2.29.2
>
This is the patch I will push unless anyone has any objections.
^ permalink raw reply [flat|nested] 20+ messages in thread
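
For the "increasing ILP in the 4x loop" point, a rough intrinsics model
of the new EVEX loop body may be useful. This is an illustrative sketch
only (byte variant, AVX512VL+BW assumed; the offsets and names differ
from the hand-written assembly above):

  #include <immintrin.h>

  /* One vpcmp-neq, two vpxor and a zero-masked vpminu stand in for
     four vpcmp-eq, moving work off the port-5 compare bottleneck noted
     in the patch.  */
  static inline int
  any_match_4x (const char *p, __m256i match)
  {
    __m256i v1 = _mm256_loadu_si256 ((const __m256i *) p);
    __m256i v2 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
    __m256i v3 = _mm256_loadu_si256 ((const __m256i *) (p + 64));
    __m256i v4 = _mm256_loadu_si256 ((const __m256i *) (p + 96));
    /* k1 holds the NOT of the VEC1 matches (VPCMP $4).  */
    __mmask32 k1 = _mm256_cmpneq_epu8_mask (v1, match);
    /* xor leaves a zero byte exactly where VEC2/VEC3 match.  */
    __m256i x2 = _mm256_xor_si256 (v2, match);
    __m256i x3 = _mm256_xor_si256 (v3, match);
    __mmask32 k3 = _mm256_cmpeq_epu8_mask (v4, match);
    /* Zero-masking by k1 forces a zero byte wherever VEC1 matched;
       otherwise the min is zero iff VEC2 or VEC3 matched there.  */
    __m256i red = _mm256_maskz_min_epu8 (k1, x2, x3);
    __mmask32 k2 = _mm256_cmpeq_epu8_mask (red, _mm256_setzero_si256 ());
    return (k2 | k3) != 0;      /* kortestd %k2, %k3 */
  }
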
end of thread, other threads:[~2021-05-03 22:59 UTC | newest]
Thread overview: 20+ messages
2021-05-03 8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 18:50 ` H.J. Lu
2021-05-03 20:06 ` Noah Goldstein
2021-05-03 20:06 ` [PATCH v2 " Noah Goldstein
2021-05-03 20:06 ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:26 ` H.J. Lu
2021-05-03 22:58 ` Noah Goldstein
2021-05-03 22:25 ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
2021-05-03 22:58 ` Noah Goldstein
2021-05-03 22:58 ` [PATCH v3 " Noah Goldstein
2021-05-03 22:58 ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:59 ` Noah Goldstein
2021-05-03 22:59 ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 18:58 ` H.J. Lu
2021-05-03 20:06 ` Noah Goldstein
2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
2021-05-03 19:51 ` Noah Goldstein
2021-05-03 20:59 ` H.J. Lu