* [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests
@ 2022-10-29 20:19 Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
` (2 more replies)
0 siblings, 3 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos
len=0 is valid and fairly common, so it should be tested.
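As a minimal, illustrative sketch (not part of the patch) of what the new
len=0 cases exercise: with a zero length no characters are compared, so the
result must be 0 no matter what the buffers hold.

    #include <assert.h>
    #include <string.h>
    #include <wchar.h>

    int
    main (void)
    {
      char a[1] = { 'x' }, b[1] = { 'y' };
      wchar_t wa[1] = { L'x' }, wb[1] = { L'y' };

      /* Zero-length compares look at no characters, so they must
         report "equal" even though the buffers differ.  */
      assert (memcmp (a, b, 0) == 0);
      assert (wmemcmp (wa, wb, 0) == 0);
      return 0;
    }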
---
benchtests/bench-memcmp.c | 18 +++++++++---------
string/test-memcmp.c | 16 ++++++++++------
2 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c
index d64eaa992e..b2816baebe 100644
--- a/benchtests/bench-memcmp.c
+++ b/benchtests/bench-memcmp.c
@@ -63,7 +63,7 @@ IMPL (MEMCMP, 1)
static void
do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s1,
- const CHAR *s2, size_t len, int exp_result)
+ const CHAR *s2, size_t len)
{
size_t i, iters = INNER_LOOP_ITERS_LARGE;
timing_t start, stop, cur;
@@ -87,9 +87,6 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
size_t i;
CHAR *s1, *s2;
- if (len == 0)
- return;
-
align1 &= (4096 - CHARBYTES);
if (align1 + (len + 1) * CHARBYTES >= page_size)
return;
@@ -111,13 +108,16 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
for (i = 0; i < len; i++)
s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
- s1[len] = align1;
- s2[len] = align2;
- s2[len - 1] -= exp_result;
+ if (len)
+ {
+ s1[len] = align1;
+ s2[len] = align2;
+ s2[len - 1] -= exp_result;
+ }
FOR_EACH_IMPL (impl, 0)
{
- do_one_test (json_ctx, impl, s1, s2, len, exp_result);
+ do_one_test (json_ctx, impl, s1, s2, len);
}
json_array_end (json_ctx);
@@ -147,7 +147,7 @@ test_main (void)
json_array_end (&json_ctx);
json_array_begin (&json_ctx, "results");
- for (i = 1; i < 32; ++i)
+ for (i = 0; i < 32; ++i)
{
do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 0);
do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 1);
diff --git a/string/test-memcmp.c b/string/test-memcmp.c
index 181b689f68..18d8b0d9f1 100644
--- a/string/test-memcmp.c
+++ b/string/test-memcmp.c
@@ -117,9 +117,6 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
size_t i;
CHAR *s1, *s2;
- if (len == 0)
- return;
-
align1 &= (4096 - CHARBYTES);
if (align1 + (len + 1) * CHARBYTES >= page_size)
return;
@@ -134,9 +131,16 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
for (i = 0; i < len; i++)
s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
- s1[len] = align1;
- s2[len] = align2;
- s2[len - 1] -= exp_result;
+ if (len)
+ {
+ s1[len] = align1;
+ s2[len] = align2;
+ s2[len - 1] -= exp_result;
+ }
+ else
+ {
+ exp_result = 0;
+ }
FOR_EACH_IMPL (impl, 0)
do_one_test (impl, s1, s2, len, exp_result);
--
2.34.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes
2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
@ 2022-10-29 20:19 ` Noah Goldstein
2022-10-31 15:47 ` H.J. Lu
2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar
2 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos
The only change to the existing generated code is `tzcnt` -> `bsf` to
save a byte of code size here and there (`tzcnt` is encoded as `bsf`
with an extra prefix byte, and the two give the same result for the
nonzero masks these paths operate on).
Rewriting with the VMM API allows memcmp-evex-movbe to be used with
evex512 by including "x86-evex512-vecs.h" at the top.
Complete check passes on x86-64.
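For context, the rewritten entry path below builds a byte-validity mask from
the length (`mov $-1; bzhi; KMOV ..., %k2`) and loads/compares under that
mask. A rough, illustrative-only scalar C analogue of that idea (the names
head_compare and vec_size are invented for the sketch, not taken from the
sources):

    #include <stddef.h>

    /* Only lanes below `len` are loaded and compared, which is why the
       short-length head needs neither a zero-size check nor page-cross
       logic.  */
    static int
    head_compare (const unsigned char *s1, const unsigned char *s2,
                  size_t len, size_t vec_size)
    {
      for (size_t i = 0; i < len && i < vec_size; i++)  /* masked lanes */
        if (s1[i] != s2[i])
          return s1[i] - s2[i];
      return 0;
    }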
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 308 +++++++++++--------
1 file changed, 175 insertions(+), 133 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index bc017768be..f6c379831e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,44 +62,38 @@ Latency:
# define MEMCMP __memcmp_evex_movbe
# endif
-# define VMOVU vmovdqu64
+# ifndef VEC_SIZE
+# include "x86-evex256-vecs.h"
+# endif
# ifdef USE_AS_WMEMCMP
# define VMOVU_MASK vmovdqu32
# define CHAR_SIZE 4
# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
# define VPTEST vptestmd
+
+# define USE_WIDE_CHAR
# else
# define VMOVU_MASK vmovdqu8
# define CHAR_SIZE 1
# define VPCMP vpcmpub
+# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb
# endif
+# include "reg-macros.h"
-# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define XMM0 xmm16
-# define XMM1 xmm17
-# define XMM2 xmm18
-# define YMM0 ymm16
-# define XMM1 xmm17
-# define XMM2 xmm18
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts.
*/
- .section .text.evex,"ax",@progbits
+ .section SECTION(.text), "ax", @progbits
/* Cache align memcmp entry. This allows for much more thorough
frontend optimization. */
ENTRY_P2ALIGN (MEMCMP, 6)
@@ -111,23 +105,40 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* Fall through for [0, VEC_SIZE] as its the hottest. */
ja L(more_1x_vec)
- /* Create mask for CHAR's we want to compare. This allows us to
- avoid having to include page cross logic. */
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k2
+ /* Create mask of bytes that are guaranteed to be valid because
+ of length (edx). Using masked movs allows us to skip checks
+ for page crosses/zero size. */
+ mov $-1, %VRAX
+ bzhi %VRDX, %VRAX, %VRAX
+ /* NB: A `jz` might be useful here. Page-faults that are
+ invalidated by predicate execution (the evex mask) can be
+ very slow. The expectation is that this is not the norm, and
+ "most" code will not regularly call 'memcmp' with length = 0
+ and memory that is not wired up. */
+ KMOV %VRAX, %k2
+
+
/* Safe to load full ymm with mask. */
- VMOVU_MASK (%rsi), %YMM2{%k2}
- VPCMP $4,(%rdi), %YMM2, %k1{%k2}
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
+ /* Slightly different method for VEC_SIZE == 64 to save a bit of
+ code size. This allows us to fit L(return_vec_0) entirely in
+ the first cache line. */
+# if VEC_SIZE == 64
+ VPCMPEQ (%rdi), %VMM(2), %k1{%k2}
+ KMOV %k1, %VRCX
+ sub %VRCX, %VRAX
+# else
+ VPCMP $4, (%rdi), %VMM(2), %k1{%k2}
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
+# endif
jnz L(return_vec_0)
ret
- .p2align 4
+ .p2align 4,, 11
L(return_vec_0):
- tzcntl %eax, %eax
+ bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
movl (%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -138,33 +149,36 @@ L(return_vec_0):
leal -1(%rdx, %rdx), %eax
# else
movzbl (%rsi, %rax), %ecx
+# if VEC_SIZE == 64
+ movb (%rdi, %rax), %al
+# else
movzbl (%rdi, %rax), %eax
+# endif
subl %ecx, %eax
# endif
ret
-
- .p2align 4
+ .p2align 4,, 11
L(more_1x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %YMM1
+ VMOVU (%rsi), %VMM(1)
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4,(%rdi), %YMM1, %k1
- kmovd %k1, %eax
+ VPCMP $4, (%rdi), %VMM(1), %k1
+ KMOV %k1, %VRAX
/* NB: eax must be destination register if going to
- L(return_vec_[0,2]). For L(return_vec_3) destination register
- must be ecx. */
- testl %eax, %eax
+ L(return_vec_[0,2]). For L(return_vec_3) destination
+ register must be ecx. */
+ test %VRAX, %VRAX
jnz L(return_vec_0)
cmpq $(CHAR_PER_VEC * 2), %rdx
jbe L(last_1x_vec)
/* Check second VEC no matter what. */
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU VEC_SIZE(%rsi), %VMM(2)
+ VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_1)
/* Less than 4 * VEC. */
@@ -172,16 +186,16 @@ L(more_1x_vec):
jbe L(last_2x_vec)
/* Check third and fourth VEC no matter what. */
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
+ VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_2)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
- kmovd %k1, %ecx
- testl %ecx, %ecx
+ VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
+ VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
jnz L(return_vec_3)
/* Go to 4x VEC loop. */
@@ -192,8 +206,8 @@ L(more_1x_vec):
branches. */
/* Load first two VEC from s2 before adjusting addresses. */
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
@@ -202,56 +216,61 @@ L(more_1x_vec):
/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
will have some 1s. */
- vpxorq (%rdi), %YMM1, %YMM1
- vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
+ vpxorq (%rdi), %VMM(1), %VMM(1)
+ vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2)
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
- oring with YMM1. Result is stored in YMM4. */
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
+ oring with VEC(1). Result is stored in VEC(4). */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
- /* Or together YMM2, YMM3, and YMM4 into YMM4. */
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */
+ vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
- /* Test YMM4 against itself. Store any CHAR mismatches in k1.
+ /* Test VEC(4) against itself. Store any CHAR mismatches in k1.
*/
- VPTEST %YMM4, %YMM4, %k1
+ VPTEST %VMM(4), %VMM(4), %k1
/* k1 must go to ecx for L(return_vec_0_1_2_3). */
- kmovd %k1, %ecx
- testl %ecx, %ecx
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
jnz L(return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
- .p2align 4,, 8
+ .p2align 4,, 9
L(8x_end_return_vec_0_1_2_3):
movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
+ /* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64. */
+# if VEC_SIZE == 64
+ KMOV %k1, %VRCX
+# endif
addq %rdi, %rsi
L(return_vec_0_1_2_3):
- VPTEST %YMM1, %YMM1, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPTEST %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_0)
- VPTEST %YMM2, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPTEST %VMM(2), %VMM(2), %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_1)
- VPTEST %YMM3, %YMM3, %k0
- kmovd %k0, %eax
- testl %eax, %eax
+ VPTEST %VMM(3), %VMM(3), %k0
+ KMOV %k0, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_2)
+ .p2align 4,, 2
L(return_vec_3):
/* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
line. */
- bsfl %ecx, %ecx
+ bsf %VRCX, %VRCX
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
xorl %edx, %edx
@@ -266,11 +285,11 @@ L(return_vec_3):
ret
- .p2align 4
+ .p2align 4,, 8
L(return_vec_1):
/* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
fetch block. */
- bsfl %eax, %eax
+ bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -284,11 +303,11 @@ L(return_vec_1):
# endif
ret
- .p2align 4,, 10
+ .p2align 4,, 7
L(return_vec_2):
/* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
fetch block. */
- bsfl %eax, %eax
+ bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -302,7 +321,7 @@ L(return_vec_2):
# endif
ret
- .p2align 4
+ .p2align 4,, 8
L(more_8x_vec):
/* Set end of s1 in rdx. */
leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
@@ -316,62 +335,82 @@ L(more_8x_vec):
.p2align 4
L(loop_4x_vec):
- VMOVU (%rsi, %rdi), %YMM1
- vpxorq (%rdi), %YMM1, %YMM1
- VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
- VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
- kmovd %k1, %ecx
- testl %ecx, %ecx
+ VMOVU (%rsi, %rdi), %VMM(1)
+ vpxorq (%rdi), %VMM(1), %VMM(1)
+ VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
+ vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2)
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
+ vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+ VPTEST %VMM(4), %VMM(4), %k1
+ /* If VEC_SIZE == 64 just branch with KTEST. We have free port0
+ space and it allows the loop to fit in 2x cache lines
+ instead of 3. */
+# if VEC_SIZE == 64
+ KTEST %k1, %k1
+# else
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+# endif
jnz L(8x_return_vec_0_1_2_3)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdx, %rdi
jb L(loop_4x_vec)
-
subq %rdx, %rdi
/* rdi has 4 * VEC_SIZE - remaining length. */
cmpl $(VEC_SIZE * 3), %edi
- jae L(8x_last_1x_vec)
+ jge L(8x_last_1x_vec)
/* Load regardless of branch. */
- VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
- cmpl $(VEC_SIZE * 2), %edi
- jae L(8x_last_2x_vec)
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
-
- VMOVU (%rsi, %rdx), %YMM1
- vpxorq (%rdx), %YMM1, %YMM1
+ /* Separate logic as we can only use testb for VEC_SIZE == 64.
+ */
+# if VEC_SIZE == 64
+ testb %dil, %dil
+ js L(8x_last_2x_vec)
+# else
+ cmpl $(VEC_SIZE * 2), %edi
+ jge L(8x_last_2x_vec)
+# endif
- VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
- kmovd %k1, %ecx
- testl %ecx, %ecx
+ vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)
+
+ VMOVU (%rsi, %rdx), %VMM(1)
+ vpxorq (%rdx), %VMM(1), %VMM(1)
+
+ VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
+ vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
+ vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+ VPTEST %VMM(4), %VMM(4), %k1
+ /* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in
+ `k1` if VEC_SIZE == 64. */
+# if VEC_SIZE == 64
+ KTEST %k1, %k1
+# else
+ KMOV %k1, %VRCX
+ test %VRCX, %VRCX
+# endif
jnz L(8x_end_return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
/* Only entry is from L(more_8x_vec). */
- .p2align 4,, 10
+ .p2align 4,, 6
L(8x_last_2x_vec):
- VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(8x_return_vec_2)
- /* Naturally aligned to 16 bytes. */
+ .p2align 4,, 5
L(8x_last_1x_vec):
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
- VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
+ VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(8x_return_vec_3)
ret
@@ -383,7 +422,7 @@ L(8x_last_1x_vec):
L(8x_return_vec_2):
subq $VEC_SIZE, %rdx
L(8x_return_vec_3):
- bsfl %eax, %eax
+ bsf %VRAX, %VRAX
# ifdef USE_AS_WMEMCMP
leaq (%rdx, %rax, CHAR_SIZE), %rax
movl (VEC_SIZE * 3)(%rax), %ecx
@@ -399,32 +438,34 @@ L(8x_return_vec_3):
# endif
ret
- .p2align 4,, 10
+ .p2align 4,, 8
L(last_2x_vec):
/* Check second to last VEC. */
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_1_end)
/* Check last VEC. */
- .p2align 4
+ .p2align 4,, 8
L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ test %VRAX, %VRAX
jnz L(return_vec_0_end)
ret
- /* Don't align. Takes 2-fetch blocks either way and aligning
- will cause code to spill into another cacheline. */
+ /* Don't fully align. Takes 2-fetch blocks either way and
+ aligning will cause code to spill into another cacheline.
+ */
+ .p2align 4,, 3
L(return_vec_1_end):
/* Use bsf to save code size. This is necessary to have
L(one_or_less) fit in aligning bytes between. */
- bsfl %eax, %eax
+ bsf %VRAX, %VRAX
addl %edx, %eax
# ifdef USE_AS_WMEMCMP
movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
@@ -439,10 +480,11 @@ L(return_vec_1_end):
# endif
ret
+ .p2align 4,, 2
/* Don't align. Takes 2-fetch blocks either way and aligning
will cause code to spill into another cacheline. */
L(return_vec_0_end):
- tzcntl %eax, %eax
+ bsf %VRAX, %VRAX
addl %edx, %eax
# ifdef USE_AS_WMEMCMP
movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
@@ -456,7 +498,7 @@ L(return_vec_0_end):
subl %ecx, %eax
# endif
ret
- /* 1-byte until next cache line. */
-
+ /* evex256: 2 bytes until the next cache line. evex512: 46 bytes
+ until the next cache line. */
END (MEMCMP)
#endif
--
2.34.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
@ 2022-10-29 20:19 ` Noah Goldstein
2022-10-31 15:48 ` H.J. Lu
2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar
2 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw)
To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos
Changes to generated code are:
1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
byte of code size (a short illustrative sketch follows this list).
2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
single basic-block (the space to add the extra branch without
changing code size is bought with the above change).
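A rough, illustrative-only sketch of trick (1) for VEC_SIZE == 32 (the helper
name chunk_differs is invented for the sketch): a vpcmpeqb mask is all ones
exactly when the two vectors match, so a single `inc` both tests the mask and
avoids the imm8 byte that the not-equal form of vpcmpb would carry.

    #include <stdint.h>

    static int
    chunk_differs (uint32_t eq_mask)  /* one bit per byte lane, 1 == equal */
    {
      /* `inc %eax` wraps 0xffffffff to 0 (ZF set) iff every lane
         matched; anything else stays nonzero and the branch is taken.  */
      return (uint32_t) (eq_mask + 1) != 0;
    }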
Change (2) gives roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
1, VEC_SIZE * 6] and has negligible to no cost for [VEC_SIZE * 6 + 1,
VEC_SIZE * 8].
From N=10 runs on Tigerlake:
align1,align2 ,length ,result ,New Time ,Cur Time,New Time / Old Time
0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000
0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029
0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011
0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989
0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003
0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992
0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989
0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978
0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964
Rewriting with the VMM API allows memcmpeq-evex to be used with
evex512 by including "x86-evex512-vecs.h" at the top.
Complete check passes on x86-64.
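For the VEC_SIZE == 64 case the patch also has to fold a 64-bit lane mask into
the 32-bit int that __memcmpeq returns. A rough, illustrative-only C sketch of
the two reductions described by the TO_32BIT comments below (popcnt when the
mask was not branched on, neg/sbb when it was), relying only on the fact that
__memcmpeq just needs zero for equal and nonzero otherwise:

    #include <stdint.h>

    /* popcntq path: the population count is nonzero exactly when the
       mask is nonzero, and it always fits in 32 bits.  */
    static int
    to_32bit_popcnt (uint64_t mask)
    {
      return __builtin_popcountll (mask);
    }

    /* neg/sbb path: `neg` sets CF iff the mask was nonzero, then
       `sbb reg, reg` materializes 0 or -1 from that carry.  */
    static int
    to_32bit_neg_sbb (uint64_t mask)
    {
      return mask != 0 ? -1 : 0;
    }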
---
sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
1 file changed, 155 insertions(+), 100 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index 41124ef1d3..671d19393e 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -41,24 +41,53 @@
# define MEMCMPEQ __memcmpeq_evex
# endif
+# ifndef VEC_SIZE
+# include "x86-evex512-vecs.h"
+# endif
+# include "reg-macros.h"
+
+
+# if VEC_SIZE == 32
+
+# define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
+# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)
+
+# define TO_32BIT_P1(reg) /* Do nothing. */
+# define TO_32BIT_P2(reg) /* Do nothing. */
+# define TO_32BIT(reg) /* Do nothing. */
+
+# define VEC_CMP VPCMPEQ
+
+# elif VEC_SIZE == 64
+
+# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
+# define TEST_ZERO(reg) neg %VGPR(reg)
+
+
+ /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
+ int. We have two methods for this. If the mask was branched
+ on, we use `neg` for the branch then `sbb` to get the 32-bit
+ return. If the mask was not branched on, we just use
+ `popcntq`. */
+# define TO_32BIT_P1(reg) TEST_ZERO(reg)
+# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
+# define TO_32BIT(reg) popcntq %reg, %reg
+
+# define VEC_CMP VPCMPNEQ
+
+# else
+# error "Unsupported VEC_SIZE"
+# endif
+
+
# define VMOVU_MASK vmovdqu8
-# define VMOVU vmovdqu64
-# define VPCMP vpcmpub
+# define VPCMPNEQ vpcmpneqb
+# define VPCMPEQ vpcmpeqb
# define VPTEST vptestmb
-# define VEC_SIZE 32
# define PAGE_SIZE 4096
-# define YMM0 ymm16
-# define YMM1 ymm17
-# define YMM2 ymm18
-# define YMM3 ymm19
-# define YMM4 ymm20
-# define YMM5 ymm21
-# define YMM6 ymm22
-
-
- .section .text.evex, "ax", @progbits
+ .section SECTION(.text), "ax", @progbits
ENTRY_P2ALIGN (MEMCMPEQ, 6)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
ja L(more_1x_vec)
/* Create mask of bytes that are guranteed to be valid because
- of length (edx). Using masked movs allows us to skip checks for
- page crosses/zero size. */
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k2
+ of length (edx). Using masked movs allows us to skip checks
+ for page crosses/zero size. */
+ mov $-1, %VRAX
+ bzhi %VRDX, %VRAX, %VRAX
+ /* NB: A `jz` might be useful here. Page-faults that are
+ invalidated by predicate execution (the evex mask) can be
+ very slow. The expectation is that this is not the norm, and
+ "most" code will not regularly call 'memcmp' with length = 0
+ and memory that is not wired up. */
+ KMOV %VRAX, %k2
/* Use masked loads as VEC_SIZE could page cross where length
(edx) would not. */
- VMOVU_MASK (%rsi), %YMM2{%k2}
- VPCMP $4,(%rdi), %YMM2, %k1{%k2}
- kmovd %k1, %eax
+ VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
+ VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
+ KMOV %k1, %VRAX
+ TO_32BIT (VRAX)
ret
-
+ .p2align 4,, 3
L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
- kmovd %k1, %eax
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
+ VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ TO_32BIT_P1 (rax)
L(return_neq0):
+ TO_32BIT_P2 (rax)
ret
-
- .p2align 4
+ .p2align 4,, 12
L(more_1x_vec):
/* From VEC + 1 to 2 * VEC. */
- VMOVU (%rsi), %YMM1
+ VMOVU (%rsi), %VMM(1)
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4,(%rdi), %YMM1, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VPCMPNEQ (%rdi), %VMM(1), %k1
+ KMOV %k1, %VRAX
+ TEST_ZERO (rax)
jnz L(return_neq0)
cmpq $(VEC_SIZE * 2), %rdx
jbe L(last_1x_vec)
/* Check second VEC no matter what. */
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU VEC_SIZE(%rsi), %VMM(2)
+ VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
+ KMOV %k1, %VRAX
+ TEST_ZERO (rax)
jnz L(return_neq0)
/* Less than 4 * VEC. */
@@ -117,16 +153,16 @@ L(more_1x_vec):
jbe L(last_2x_vec)
/* Check third and fourth VEC no matter what. */
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
+ VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
+ KMOV %k1, %VRAX
+ TEST_ZERO_VCMP (rax)
jnz L(return_neq0)
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
+ VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
+ KMOV %k1, %VRAX
+ TEST_ZERO_VCMP (rax)
jnz L(return_neq0)
/* Go to 4x VEC loop. */
@@ -136,8 +172,8 @@ L(more_1x_vec):
/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
branches. */
- VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
- VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
addq %rdx, %rdi
/* Wait to load from s1 until addressed adjust due to
@@ -145,26 +181,32 @@ L(more_1x_vec):
/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
will have some 1s. */
- vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
- /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
- oring with YMM1. Result is stored in YMM1. */
- vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
-
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
- vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
- VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4
- vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4
-
- /* Or together YMM2, YMM3, and YMM4 into YMM4. */
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
-
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
- VPTEST %YMM4, %YMM4, %k1
- kmovd %k1, %eax
+ vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
+ /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while
+ oring with VEC(1). Result is stored in VEC(1). */
+ vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
+
+ cmpl $(VEC_SIZE * 6), %edx
+ jbe L(4x_last_2x_vec)
+
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
+ vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
+ /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
+ vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)
+
+ /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2). */
+ vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
+
+ /* Compare VEC(2) with 0. If any 1s, s1 and s2 don't match. */
+L(4x_last_2x_vec):
+ VPTEST %VMM(2), %VMM(2), %k1
+ KMOV %k1, %VRAX
+ TO_32BIT (VRAX)
ret
- .p2align 4
+
+ .p2align 4,, 10
L(more_8x_vec):
/* Set end of s1 in rdx. */
leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
@@ -175,67 +217,80 @@ L(more_8x_vec):
andq $-VEC_SIZE, %rdi
/* Adjust because first 4x vec where check already. */
subq $-(VEC_SIZE * 4), %rdi
- .p2align 4
+ .p2align 5,, 12
+ .p2align 4,, 8
L(loop_4x_vec):
- VMOVU (%rsi, %rdi), %YMM1
- vpxorq (%rdi), %YMM1, %YMM1
+ VMOVU (%rsi, %rdi), %VMM(1)
+ vpxorq (%rdi), %VMM(1), %VMM(1)
- VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
+ VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
+ vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)
- VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
- vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
+ vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
- VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
+ vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)
- vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
- VPTEST %YMM4, %YMM4, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+ vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
+ VPTEST %VMM(4), %VMM(4), %k1
+ KMOV %k1, %VRAX
+ TEST_ZERO (rax)
jnz L(return_neq2)
subq $-(VEC_SIZE * 4), %rdi
cmpq %rdx, %rdi
jb L(loop_4x_vec)
subq %rdx, %rdi
- VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
+
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
+ vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
/* rdi has 4 * VEC_SIZE - remaining length. */
- cmpl $(VEC_SIZE * 3), %edi
- jae L(8x_last_1x_vec)
+
/* Load regardless of branch. */
- VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
- /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
- oring with YMM4. Result is stored in YMM4. */
- vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
+ /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
+ oring with VEC(4). Result is stored in VEC(4). */
+ vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
+
+ /* Separate logic as we can only use testb for VEC_SIZE == 64.
+ */
+# if VEC_SIZE == 64
+ testb %dil, %dil
+ js L(8x_last_2x_vec)
+# else
cmpl $(VEC_SIZE * 2), %edi
- jae L(8x_last_2x_vec)
+ jge L(8x_last_2x_vec)
+# endif
- VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
+ VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
+ vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
- VMOVU (%rsi, %rdx), %YMM1
- vpxorq (%rdx), %YMM1, %YMM1
+ VMOVU (%rsi, %rdx), %VMM(1)
+ vpxorq (%rdx), %VMM(1), %VMM(1)
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
+ vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
L(8x_last_1x_vec):
L(8x_last_2x_vec):
- VPTEST %YMM4, %YMM4, %k1
- kmovd %k1, %eax
+ VPTEST %VMM(4), %VMM(4), %k1
+ KMOV %k1, %VRAX
+ TO_32BIT_P1 (rax)
L(return_neq2):
+ TO_32BIT_P2 (rax)
ret
- .p2align 4,, 8
+ .p2align 4,, 4
L(last_2x_vec):
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
- vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
- vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
- VPTEST %YMM2, %YMM2, %k1
- kmovd %k1, %eax
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
+ vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
+ vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
+ VPTEST %VMM(2), %VMM(2), %k1
+ KMOV %k1, %VRAX
+ TO_32BIT (VRAX)
ret
- /* 1 Bytes from next cache line. */
+ /* evex256: 1 byte from the next cache line. evex512: 15 bytes from
+ the next cache line. */
END (MEMCMPEQ)
#endif
--
2.34.1
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests
2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
@ 2022-10-31 13:19 ` Siddhesh Poyarekar
2 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2022-10-31 13:19 UTC (permalink / raw)
To: Noah Goldstein, libc-alpha
LGTM.
Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
On 2022-10-29 16:19, Noah Goldstein via Libc-alpha wrote:
> len=0 is valid and fairly common so should be tested.
> ---
> benchtests/bench-memcmp.c | 18 +++++++++---------
> string/test-memcmp.c | 16 ++++++++++------
> 2 files changed, 19 insertions(+), 15 deletions(-)
>
> diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c
> index d64eaa992e..b2816baebe 100644
> --- a/benchtests/bench-memcmp.c
> +++ b/benchtests/bench-memcmp.c
> @@ -63,7 +63,7 @@ IMPL (MEMCMP, 1)
>
> static void
> do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s1,
> - const CHAR *s2, size_t len, int exp_result)
> + const CHAR *s2, size_t len)
> {
> size_t i, iters = INNER_LOOP_ITERS_LARGE;
> timing_t start, stop, cur;
> @@ -87,9 +87,6 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> size_t i;
> CHAR *s1, *s2;
>
> - if (len == 0)
> - return;
> -
> align1 &= (4096 - CHARBYTES);
> if (align1 + (len + 1) * CHARBYTES >= page_size)
> return;
> @@ -111,13 +108,16 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len,
> for (i = 0; i < len; i++)
> s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR;
>
> - s1[len] = align1;
> - s2[len] = align2;
> - s2[len - 1] -= exp_result;
> + if (len)
> + {
> + s1[len] = align1;
> + s2[len] = align2;
> + s2[len - 1] -= exp_result;
> + }
>
> FOR_EACH_IMPL (impl, 0)
> {
> - do_one_test (json_ctx, impl, s1, s2, len, exp_result);
> + do_one_test (json_ctx, impl, s1, s2, len);
> }
>
> json_array_end (json_ctx);
> @@ -147,7 +147,7 @@ test_main (void)
> json_array_end (&json_ctx);
>
> json_array_begin (&json_ctx, "results");
> - for (i = 1; i < 32; ++i)
> + for (i = 0; i < 32; ++i)
> {
> do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 0);
> do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 1);
> diff --git a/string/test-memcmp.c b/string/test-memcmp.c
> index 181b689f68..18d8b0d9f1 100644
> --- a/string/test-memcmp.c
> +++ b/string/test-memcmp.c
> @@ -117,9 +117,6 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
> size_t i;
> CHAR *s1, *s2;
>
> - if (len == 0)
> - return;
> -
> align1 &= (4096 - CHARBYTES);
> if (align1 + (len + 1) * CHARBYTES >= page_size)
> return;
> @@ -134,9 +131,16 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result)
> for (i = 0; i < len; i++)
> s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX;
>
> - s1[len] = align1;
> - s2[len] = align2;
> - s2[len - 1] -= exp_result;
> + if (len)
> + {
> + s1[len] = align1;
> + s2[len] = align2;
> + s2[len - 1] -= exp_result;
> + }
> + else
> + {
> + exp_result = 0;
> + }
>
> FOR_EACH_IMPL (impl, 0)
> do_one_test (impl, s1, s2, len, exp_result);
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
@ 2022-10-31 15:47 ` H.J. Lu
0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2022-10-31 15:47 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The only change to the existing generated code is `tzcnt` -> `bsf` to
> save a byte of code size here and there.
>
> Rewriting with VMM API allows for memcmp-evex-movbe to be used with
> evex512 by including "x86-evex512-vecs.h" at the top.
>
> Complete check passes on x86-64.
> ---
> sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 308 +++++++++++--------
> 1 file changed, 175 insertions(+), 133 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> index bc017768be..f6c379831e 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> @@ -62,44 +62,38 @@ Latency:
> # define MEMCMP __memcmp_evex_movbe
> # endif
>
> -# define VMOVU vmovdqu64
> +# ifndef VEC_SIZE
> +# include "x86-evex256-vecs.h"
> +# endif
>
> # ifdef USE_AS_WMEMCMP
> # define VMOVU_MASK vmovdqu32
> # define CHAR_SIZE 4
> # define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> # define VPTEST vptestmd
> +
> +# define USE_WIDE_CHAR
> # else
> # define VMOVU_MASK vmovdqu8
> # define CHAR_SIZE 1
> # define VPCMP vpcmpub
> +# define VPCMPEQ vpcmpeqb
> # define VPTEST vptestmb
> # endif
>
> +# include "reg-macros.h"
>
> -# define VEC_SIZE 32
> # define PAGE_SIZE 4096
> # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> -# define XMM0 xmm16
> -# define XMM1 xmm17
> -# define XMM2 xmm18
> -# define YMM0 ymm16
> -# define XMM1 xmm17
> -# define XMM2 xmm18
> -# define YMM1 ymm17
> -# define YMM2 ymm18
> -# define YMM3 ymm19
> -# define YMM4 ymm20
> -# define YMM5 ymm21
> -# define YMM6 ymm22
>
> /* Warning!
> wmemcmp has to use SIGNED comparison for elements.
> memcmp has to use UNSIGNED comparison for elemnts.
> */
>
> - .section .text.evex,"ax",@progbits
> + .section SECTION(.text), "ax", @progbits
> /* Cache align memcmp entry. This allows for much more thorough
> frontend optimization. */
> ENTRY_P2ALIGN (MEMCMP, 6)
> @@ -111,23 +105,40 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> /* Fall through for [0, VEC_SIZE] as its the hottest. */
> ja L(more_1x_vec)
>
> - /* Create mask for CHAR's we want to compare. This allows us to
> - avoid having to include page cross logic. */
> - movl $-1, %ecx
> - bzhil %edx, %ecx, %ecx
> - kmovd %ecx, %k2
> + /* Create mask of bytes that are guranteed to be valid because
> + of length (edx). Using masked movs allows us to skip checks
> + for page crosses/zero size. */
> + mov $-1, %VRAX
> + bzhi %VRDX, %VRAX, %VRAX
> + /* NB: A `jz` might be useful here. Page-faults that are
> + invalidated by predicate execution (the evex mask) can be
> + very slow. The expectation is this is not the norm so and
> + "most" code will not regularly call 'memcmp' with length = 0
> + and memory that is not wired up. */
> + KMOV %VRAX, %k2
> +
> +
>
> /* Safe to load full ymm with mask. */
> - VMOVU_MASK (%rsi), %YMM2{%k2}
> - VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
> + /* Slightly different method for VEC_SIZE == 64 to save a bit of
> + code size. This allows us to fit L(return_vec_0) entirely in
> + the first cache line. */
> +# if VEC_SIZE == 64
> + VPCMPEQ (%rdi), %VMM(2), %k1{%k2}
> + KMOV %k1, %VRCX
> + sub %VRCX, %VRAX
> +# else
> + VPCMP $4, (%rdi), %VMM(2), %k1{%k2}
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> +# endif
> jnz L(return_vec_0)
> ret
>
> - .p2align 4
> + .p2align 4,, 11
> L(return_vec_0):
> - tzcntl %eax, %eax
> + bsf %VRAX, %VRAX
> # ifdef USE_AS_WMEMCMP
> movl (%rdi, %rax, CHAR_SIZE), %ecx
> xorl %edx, %edx
> @@ -138,33 +149,36 @@ L(return_vec_0):
> leal -1(%rdx, %rdx), %eax
> # else
> movzbl (%rsi, %rax), %ecx
> +# if VEC_SIZE == 64
> + movb (%rdi, %rax), %al
> +# else
> movzbl (%rdi, %rax), %eax
> +# endif
> subl %ecx, %eax
> # endif
> ret
>
> -
> - .p2align 4
> + .p2align 4,, 11
> L(more_1x_vec):
> /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
> - VMOVU (%rsi), %YMM1
> + VMOVU (%rsi), %VMM(1)
> /* Use compare not equals to directly check for mismatch. */
> - VPCMP $4,(%rdi), %YMM1, %k1
> - kmovd %k1, %eax
> + VPCMP $4, (%rdi), %VMM(1), %k1
> + KMOV %k1, %VRAX
> /* NB: eax must be destination register if going to
> - L(return_vec_[0,2]). For L(return_vec_3) destination register
> - must be ecx. */
> - testl %eax, %eax
> + L(return_vec_[0,2]). For L(return_vec_3) destination
> + register must be ecx. */
> + test %VRAX, %VRAX
> jnz L(return_vec_0)
>
> cmpq $(CHAR_PER_VEC * 2), %rdx
> jbe L(last_1x_vec)
>
> /* Check second VEC no matter what. */
> - VMOVU VEC_SIZE(%rsi), %YMM2
> - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU VEC_SIZE(%rsi), %VMM(2)
> + VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_1)
>
> /* Less than 4 * VEC. */
> @@ -172,16 +186,16 @@ L(more_1x_vec):
> jbe L(last_2x_vec)
>
> /* Check third and fourth VEC no matter what. */
> - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
> + VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_2)
>
> - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> - kmovd %k1, %ecx
> - testl %ecx, %ecx
> + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
> + VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
> + KMOV %k1, %VRCX
> + test %VRCX, %VRCX
> jnz L(return_vec_3)
>
> /* Go to 4x VEC loop. */
> @@ -192,8 +206,8 @@ L(more_1x_vec):
> branches. */
>
> /* Load first two VEC from s2 before adjusting addresses. */
> - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
> - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
> + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2)
> leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
> leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
>
> @@ -202,56 +216,61 @@ L(more_1x_vec):
>
> /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
> will have some 1s. */
> - vpxorq (%rdi), %YMM1, %YMM1
> - vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
> + vpxorq (%rdi), %VMM(1), %VMM(1)
> + vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2)
>
> - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
> + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
>
> - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> - /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
> - oring with YMM1. Result is stored in YMM4. */
> - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
> + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while
> + oring with VEC(1). Result is stored in VEC(4). */
> + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
>
> - /* Or together YMM2, YMM3, and YMM4 into YMM4. */
> - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> + /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */
> + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
>
> - /* Test YMM4 against itself. Store any CHAR mismatches in k1.
> + /* Test VEC(4) against itself. Store any CHAR mismatches in k1.
> */
> - VPTEST %YMM4, %YMM4, %k1
> + VPTEST %VMM(4), %VMM(4), %k1
> /* k1 must go to ecx for L(return_vec_0_1_2_3). */
> - kmovd %k1, %ecx
> - testl %ecx, %ecx
> + KMOV %k1, %VRCX
> + test %VRCX, %VRCX
> jnz L(return_vec_0_1_2_3)
> /* NB: eax must be zero to reach here. */
> ret
>
>
> - .p2align 4,, 8
> + .p2align 4,, 9
> L(8x_end_return_vec_0_1_2_3):
> movq %rdx, %rdi
> L(8x_return_vec_0_1_2_3):
> + /* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64. */
> +# if VEC_SIZE == 64
> + KMOV %k1, %VRCX
> +# endif
> addq %rdi, %rsi
> L(return_vec_0_1_2_3):
> - VPTEST %YMM1, %YMM1, %k0
> - kmovd %k0, %eax
> - testl %eax, %eax
> + VPTEST %VMM(1), %VMM(1), %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_0)
>
> - VPTEST %YMM2, %YMM2, %k0
> - kmovd %k0, %eax
> - testl %eax, %eax
> + VPTEST %VMM(2), %VMM(2), %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_1)
>
> - VPTEST %YMM3, %YMM3, %k0
> - kmovd %k0, %eax
> - testl %eax, %eax
> + VPTEST %VMM(3), %VMM(3), %k0
> + KMOV %k0, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_2)
> + .p2align 4,, 2
> L(return_vec_3):
> /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
> fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
> line. */
> - bsfl %ecx, %ecx
> + bsf %VRCX, %VRCX
> # ifdef USE_AS_WMEMCMP
> movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
> xorl %edx, %edx
> @@ -266,11 +285,11 @@ L(return_vec_3):
> ret
>
>
> - .p2align 4
> + .p2align 4,, 8
> L(return_vec_1):
> /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
> fetch block. */
> - bsfl %eax, %eax
> + bsf %VRAX, %VRAX
> # ifdef USE_AS_WMEMCMP
> movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
> xorl %edx, %edx
> @@ -284,11 +303,11 @@ L(return_vec_1):
> # endif
> ret
>
> - .p2align 4,, 10
> + .p2align 4,, 7
> L(return_vec_2):
> /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
> fetch block. */
> - bsfl %eax, %eax
> + bsf %VRAX, %VRAX
> # ifdef USE_AS_WMEMCMP
> movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
> xorl %edx, %edx
> @@ -302,7 +321,7 @@ L(return_vec_2):
> # endif
> ret
>
> - .p2align 4
> + .p2align 4,, 8
> L(more_8x_vec):
> /* Set end of s1 in rdx. */
> leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
> @@ -316,62 +335,82 @@ L(more_8x_vec):
>
> .p2align 4
> L(loop_4x_vec):
> - VMOVU (%rsi, %rdi), %YMM1
> - vpxorq (%rdi), %YMM1, %YMM1
> - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
> - vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
> - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> - VPTEST %YMM4, %YMM4, %k1
> - kmovd %k1, %ecx
> - testl %ecx, %ecx
> + VMOVU (%rsi, %rdi), %VMM(1)
> + vpxorq (%rdi), %VMM(1), %VMM(1)
> + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
> + vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2)
> + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
> + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
> + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
> + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4)
> + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
> + VPTEST %VMM(4), %VMM(4), %k1
> + /* If VEC_SIZE == 64 just branch with KTEST. We have free port0
> + space and it allows the loop to fit in 2x cache lines
> + instead of 3. */
> +# if VEC_SIZE == 64
> + KTEST %k1, %k1
> +# else
> + KMOV %k1, %VRCX
> + test %VRCX, %VRCX
> +# endif
> jnz L(8x_return_vec_0_1_2_3)
> subq $-(VEC_SIZE * 4), %rdi
> cmpq %rdx, %rdi
> jb L(loop_4x_vec)
> -
> subq %rdx, %rdi
> /* rdi has 4 * VEC_SIZE - remaining length. */
> cmpl $(VEC_SIZE * 3), %edi
> - jae L(8x_last_1x_vec)
> + jge L(8x_last_1x_vec)
> /* Load regardless of branch. */
> - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> - cmpl $(VEC_SIZE * 2), %edi
> - jae L(8x_last_2x_vec)
> + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
>
> - vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
> -
> - VMOVU (%rsi, %rdx), %YMM1
> - vpxorq (%rdx), %YMM1, %YMM1
> + /* Seperate logic as we can only use testb for VEC_SIZE == 64.
> + */
> +# if VEC_SIZE == 64
> + testb %dil, %dil
> + js L(8x_last_2x_vec)
> +# else
> + cmpl $(VEC_SIZE * 2), %edi
> + jge L(8x_last_2x_vec)
> +# endif
>
> - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
> - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
> - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> - vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
> - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> - VPTEST %YMM4, %YMM4, %k1
> - kmovd %k1, %ecx
> - testl %ecx, %ecx
> + vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3)
> +
> + VMOVU (%rsi, %rdx), %VMM(1)
> + vpxorq (%rdx), %VMM(1), %VMM(1)
> +
> + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
> + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
> + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
> + vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4)
> + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
> + VPTEST %VMM(4), %VMM(4), %k1
> + /* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in
> + `k1` if VEC_SIZE == 64. */
> +# if VEC_SIZE == 64
> + KTEST %k1, %k1
> +# else
> + KMOV %k1, %VRCX
> + test %VRCX, %VRCX
> +# endif
> jnz L(8x_end_return_vec_0_1_2_3)
> /* NB: eax must be zero to reach here. */
> ret
>
> /* Only entry is from L(more_8x_vec). */
> - .p2align 4,, 10
> + .p2align 4,, 6
> L(8x_last_2x_vec):
> - VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(8x_return_vec_2)
> - /* Naturally aligned to 16 bytes. */
> + .p2align 4,, 5
> L(8x_last_1x_vec):
> - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
> - VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1)
> + VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(8x_return_vec_3)
> ret
>
> @@ -383,7 +422,7 @@ L(8x_last_1x_vec):
> L(8x_return_vec_2):
> subq $VEC_SIZE, %rdx
> L(8x_return_vec_3):
> - bsfl %eax, %eax
> + bsf %VRAX, %VRAX
> # ifdef USE_AS_WMEMCMP
> leaq (%rdx, %rax, CHAR_SIZE), %rax
> movl (VEC_SIZE * 3)(%rax), %ecx
> @@ -399,32 +438,34 @@ L(8x_return_vec_3):
> # endif
> ret
>
> - .p2align 4,, 10
> + .p2align 4,, 8
> L(last_2x_vec):
> /* Check second to last VEC. */
> - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
> - VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> + VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_1_end)
>
> /* Check last VEC. */
> - .p2align 4
> + .p2align 4,, 8
> L(last_1x_vec):
> - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
> - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
> + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + test %VRAX, %VRAX
> jnz L(return_vec_0_end)
> ret
>
>
> - /* Don't align. Takes 2-fetch blocks either way and aligning
> - will cause code to spill into another cacheline. */
> + /* Don't fully align. Takes 2-fetch blocks either way and
> + aligning will cause code to spill into another cacheline.
> + */
> + .p2align 4,, 3
> L(return_vec_1_end):
> /* Use bsf to save code size. This is necessary to have
> L(one_or_less) fit in aligning bytes between. */
> - bsfl %eax, %eax
> + bsf %VRAX, %VRAX
> addl %edx, %eax
> # ifdef USE_AS_WMEMCMP
> movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
> @@ -439,10 +480,11 @@ L(return_vec_1_end):
> # endif
> ret
>
> + .p2align 4,, 2
> /* Don't align. Takes 2-fetch blocks either way and aligning
> will cause code to spill into another cacheline. */
> L(return_vec_0_end):
> - tzcntl %eax, %eax
> + bsf %VRAX, %VRAX
> addl %edx, %eax
> # ifdef USE_AS_WMEMCMP
> movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
> @@ -456,7 +498,7 @@ L(return_vec_0_end):
> subl %ecx, %eax
> # endif
> ret
> - /* 1-byte until next cache line. */
> -
> + /* evex256: 2-byte until next cache line. evex512: 46-bytes
> + until next cache line. */
> END (MEMCMP)
> #endif
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
@ 2022-10-31 15:48 ` H.J. Lu
2022-10-31 16:42 ` Noah Goldstein
0 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2022-10-31 15:48 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Changes to generated code are:
> 1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
> byte of code size.
> 2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
> the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
> single basic-block (the space to add the extra branch without
> changing code size is bought with the above change).
>
> Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
> 1, VEC_SIZE * 6] and negligible to no-cost for [VEC_SIZE * 6 + 1,
> VEC_SIZE * 8]
>
> From N=10 runs on Tigerlake:
>
> align1,align2 ,length ,result ,New TIme ,Cur Time,New Time / Old Time
> 0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
> 0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
> 0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
> 0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
> 0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
> 0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
>
> 0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000
> 0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029
> 0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011
> 0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989
> 0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003
> 0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992
> 0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989
> 0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978
> 0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964
>
> Rewriting with VMM API allows for memcmpeq-evex to be used with
> evex512 by including "x86-evex512-vecs.h" at the top.
>
> Complete check passes on x86-64.
> ---
> sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
> 1 file changed, 155 insertions(+), 100 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index 41124ef1d3..671d19393e 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -41,24 +41,53 @@
> # define MEMCMPEQ __memcmpeq_evex
> # endif
>
> +# ifndef VEC_SIZE
> +# include "x86-evex512-vecs.h"
> +# endif
> +# include "reg-macros.h"
> +
> +
> +# if VEC_SIZE == 32
> +
> +# define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
> +# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)
> +
> +# define TO_32BIT_P1(reg) /* Do nothing. */
> +# define TO_32BIT_P2(reg) /* Do nothing. */
> +# define TO_32BIT(reg) /* Do nothing. */
> +
> +# define VEC_CMP VPCMPEQ
> +
> +# elif VEC_SIZE == 64
> +
> +# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
> +# define TEST_ZERO(reg) neg %VGPR(reg)
> +
> +
> + /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
> + int. We have two methods for this. If the mask with branched
> + on, we use `neg` for the branch then `sbb` to get the 32-bit
> + return. If the mask was no branched on, we just use
> + `popcntq`. */
> +# define TO_32BIT_P1(reg) TEST_ZERO(reg)
> +# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
> +# define TO_32BIT(reg) popcntq %reg, %reg
> +
> +# define VEC_CMP VPCMPNEQ
> +
> +# else
> +# error "Unsupported VEC_SIZE"
> +# endif
> +
> +
> # define VMOVU_MASK vmovdqu8
> -# define VMOVU vmovdqu64
> -# define VPCMP vpcmpub
> +# define VPCMPNEQ vpcmpneqb
> +# define VPCMPEQ vpcmpeqb
> # define VPTEST vptestmb
>
> -# define VEC_SIZE 32
> # define PAGE_SIZE 4096
>
> -# define YMM0 ymm16
> -# define YMM1 ymm17
> -# define YMM2 ymm18
> -# define YMM3 ymm19
> -# define YMM4 ymm20
> -# define YMM5 ymm21
> -# define YMM6 ymm22
> -
> -
> - .section .text.evex, "ax", @progbits
> + .section SECTION(.text), "ax", @progbits
> ENTRY_P2ALIGN (MEMCMPEQ, 6)
> # ifdef __ILP32__
> /* Clear the upper 32 bits. */
> @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
> ja L(more_1x_vec)
>
> /* Create mask of bytes that are guranteed to be valid because
> - of length (edx). Using masked movs allows us to skip checks for
> - page crosses/zero size. */
> - movl $-1, %ecx
> - bzhil %edx, %ecx, %ecx
> - kmovd %ecx, %k2
> + of length (edx). Using masked movs allows us to skip checks
> + for page crosses/zero size. */
> + mov $-1, %VRAX
> + bzhi %VRDX, %VRAX, %VRAX
> + /* NB: A `jz` might be useful here. Page-faults that are
> + invalidated by predicate execution (the evex mask) can be
> + very slow. The expectation is this is not the norm so and
> + "most" code will not regularly call 'memcmp' with length = 0
> + and memory that is not wired up. */
> + KMOV %VRAX, %k2
>
> /* Use masked loads as VEC_SIZE could page cross where length
> (edx) would not. */
> - VMOVU_MASK (%rsi), %YMM2{%k2}
> - VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> - kmovd %k1, %eax
> + VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
> + VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
> + KMOV %k1, %VRAX
> + TO_32BIT (VRAX)
> ret
>
> -
> + .p2align 4,, 3
> L(last_1x_vec):
> - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> - kmovd %k1, %eax
> + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> + VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + TO_32BIT_P1 (rax)
> L(return_neq0):
> + TO_32BIT_P2 (rax)
> ret
>
>
> -
> - .p2align 4
> + .p2align 4,, 12
> L(more_1x_vec):
> /* From VEC + 1 to 2 * VEC. */
> - VMOVU (%rsi), %YMM1
> + VMOVU (%rsi), %VMM(1)
> /* Use compare not equals to directly check for mismatch. */
> - VPCMP $4,(%rdi), %YMM1, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VPCMPNEQ (%rdi), %VMM(1), %k1
> + KMOV %k1, %VRAX
> + TEST_ZERO (rax)
> jnz L(return_neq0)
>
> cmpq $(VEC_SIZE * 2), %rdx
> jbe L(last_1x_vec)
>
> /* Check second VEC no matter what. */
> - VMOVU VEC_SIZE(%rsi), %YMM2
> - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU VEC_SIZE(%rsi), %VMM(2)
> + VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
> + KMOV %k1, %VRAX
> + TEST_ZERO (rax)
> jnz L(return_neq0)
>
> /* Less than 4 * VEC. */
> @@ -117,16 +153,16 @@ L(more_1x_vec):
> jbe L(last_2x_vec)
>
> /* Check third and fourth VEC no matter what. */
> - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
> + VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
> + KMOV %k1, %VRAX
> + TEST_ZERO_VCMP (rax)
> jnz L(return_neq0)
>
> - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
> + VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
> + KMOV %k1, %VRAX
> + TEST_ZERO_VCMP (rax)
> jnz L(return_neq0)
>
> /* Go to 4x VEC loop. */
> @@ -136,8 +172,8 @@ L(more_1x_vec):
> /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
> branches. */
>
> - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
> - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
> + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> addq %rdx, %rdi
>
> /* Wait to load from s1 until addressed adjust due to
> @@ -145,26 +181,32 @@ L(more_1x_vec):
>
> /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
> will have some 1s. */
> - vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
> - /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
> - oring with YMM1. Result is stored in YMM1. */
> - vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
> -
> - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> - vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> - /* Or together YMM1, YMM2, and YMM3 into YMM3. */
> - VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4
> - vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4
> -
> - /* Or together YMM2, YMM3, and YMM4 into YMM4. */
> - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> -
> - /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
> - VPTEST %YMM4, %YMM4, %k1
> - kmovd %k1, %eax
> + vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
> + /* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
> + oring with VEC(1). Result is stored in VEC(2). */
> + vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
> +
> + cmpl $(VEC_SIZE * 6), %edx
> + jbe L(4x_last_2x_vec)
> +
> + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
> + vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
> + /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */
> + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
> + vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)
> +
> + /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2). */
> + vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
> +
> + /* Compare VEC(2) with 0. If any 1s, s1 and s2 don't match. */
> +L(4x_last_2x_vec):
> + VPTEST %VMM(2), %VMM(2), %k1
> + KMOV %k1, %VRAX
> + TO_32BIT (VRAX)
> ret
>
> - .p2align 4
> +
> + .p2align 4,, 10
> L(more_8x_vec):
> /* Set end of s1 in rdx. */
> leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
> @@ -175,67 +217,80 @@ L(more_8x_vec):
> andq $-VEC_SIZE, %rdi
> /* Adjust because first 4x vec where check already. */
> subq $-(VEC_SIZE * 4), %rdi
> - .p2align 4
> + .p2align 5,, 12
> + .p2align 4,, 8
> L(loop_4x_vec):
> - VMOVU (%rsi, %rdi), %YMM1
> - vpxorq (%rdi), %YMM1, %YMM1
> + VMOVU (%rsi, %rdi), %VMM(1)
> + vpxorq (%rdi), %VMM(1), %VMM(1)
>
> - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
> - vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
> + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
> + vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)
>
> - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
> + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
>
> - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> - vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
> + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
> + vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)
>
> - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> - VPTEST %YMM4, %YMM4, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
> + VPTEST %VMM(4), %VMM(4), %k1
> + KMOV %k1, %VRAX
> + TEST_ZERO (rax)
> jnz L(return_neq2)
> subq $-(VEC_SIZE * 4), %rdi
> cmpq %rdx, %rdi
> jb L(loop_4x_vec)
>
> subq %rdx, %rdi
> - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> - vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
> +
> + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
> + vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
> /* rdi has 4 * VEC_SIZE - remaining length. */
> - cmpl $(VEC_SIZE * 3), %edi
> - jae L(8x_last_1x_vec)
> +
> /* Load regardless of branch. */
> - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> - /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
> - oring with YMM4. Result is stored in YMM4. */
> - vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> + /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
> + oring with VEC(4). Result is stored in VEC(4). */
> + vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
> +
> + /* Separate logic, as we can only use testb for VEC_SIZE == 64.
> + */
> +# if VEC_SIZE == 64
> + testb %dil, %dil
> + js L(8x_last_2x_vec)
> +# else
> cmpl $(VEC_SIZE * 2), %edi
> - jae L(8x_last_2x_vec)
> + jge L(8x_last_2x_vec)
> +# endif
>
> - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
> - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
> + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
> + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
>
> - VMOVU (%rsi, %rdx), %YMM1
> - vpxorq (%rdx), %YMM1, %YMM1
> + VMOVU (%rsi, %rdx), %VMM(1)
> + vpxorq (%rdx), %VMM(1), %VMM(1)
>
> - vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
> + vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
> L(8x_last_1x_vec):
> L(8x_last_2x_vec):
> - VPTEST %YMM4, %YMM4, %k1
> - kmovd %k1, %eax
> + VPTEST %VMM(4), %VMM(4), %k1
> + KMOV %k1, %VRAX
> + TO_32BIT_P1 (rax)
> L(return_neq2):
> + TO_32BIT_P2 (rax)
> ret
>
> - .p2align 4,, 8
> + .p2align 4,, 4
> L(last_2x_vec):
> - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> - vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> - vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> - VPTEST %YMM2, %YMM2, %k1
> - kmovd %k1, %eax
> + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
> + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
> + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
> + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
> + VPTEST %VMM(2), %VMM(2), %k1
> + KMOV %k1, %VRAX
> + TO_32BIT (VRAX)
> ret
>
> - /* 1 Bytes from next cache line. */
> + /* evex256: 1 byte from next cache line. evex512: 15 bytes from
> + next cache line. */
> END (MEMCMPEQ)
> #endif
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
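
For readers following the VEC_SIZE == 64 macros in the quoted patch: the
TEST_ZERO/TO_32BIT pair only has to turn the 64-bit k-mask into
__memcmpeq's zero/non-zero 32-bit return. A rough C sketch of the two
reductions, illustrative only and not the glibc sources (the helper
names are made up, and __builtin_popcountll assumes GCC/Clang):

/* One bit of 'mask' is set per mismatching byte; the caller only
   needs zero vs. non-zero in a 32-bit int.  */
#include <stdint.h>

/* Straight-line path (TO_32BIT): popcnt of the 64-bit mask is
   non-zero exactly when some byte differed.  */
static inline int
reduce_popcnt (uint64_t mask)
{
  return (int) __builtin_popcountll (mask);
}

/* Branched path (TO_32BIT_P1/TO_32BIT_P2): `neg` sets CF iff the mask
   was non-zero, and `sbb reg, reg` then materializes 0 or -1, so the
   upper 32 bits of the mask never need to be inspected again.  */
static inline int
reduce_neg_sbb (uint64_t mask)
{
  return mask != 0 ? -1 : 0;
}
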
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
2022-10-31 15:48 ` H.J. Lu
@ 2022-10-31 16:42 ` Noah Goldstein
0 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-10-31 16:42 UTC (permalink / raw)
To: H.J. Lu; +Cc: libc-alpha, carlos
On Mon, Oct 31, 2022 at 10:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Changes to generated code are:
> > 1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
> > byte of code size.
> > 2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
> > the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
> > single basic-block (the space to add the extra branch without
> > changing code size is bought with the above change).
> >
> > Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
> > 1, VEC_SIZE * 6] and negligible to no-cost for [VEC_SIZE * 6 + 1,
> > VEC_SIZE * 8]
> >
> > From N=10 runs on Tigerlake:
> >
> > align1,align2 ,length ,result ,New Time ,Cur Time,New Time / Old Time
> > 0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
> > 0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
> > 0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
> > 0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
> > 0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
> > 0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
> >
> > 0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000
> > 0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029
> > 0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011
> > 0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989
> > 0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003
> > 0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992
> > 0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989
> > 0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978
> > 0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964
> >
> > Rewriting with VMM API allows for memcmpeq-evex to be used with
> > evex512 by including "x86-evex512-vecs.h" at the top.
> >
> > Complete check passes on x86-64.
> > ---
> > sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
> > 1 file changed, 155 insertions(+), 100 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > index 41124ef1d3..671d19393e 100644
> > --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > @@ -41,24 +41,53 @@
> > # define MEMCMPEQ __memcmpeq_evex
> > # endif
> >
> > +# ifndef VEC_SIZE
> > +# include "x86-evex512-vecs.h"
> > +# endif
> > +# include "reg-macros.h"
> > +
> > +
> > +# if VEC_SIZE == 32
> > +
> > +# define TEST_ZERO_VCMP(reg) inc %VGPR(reg)
> > +# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg)
> > +
> > +# define TO_32BIT_P1(reg) /* Do nothing. */
> > +# define TO_32BIT_P2(reg) /* Do nothing. */
> > +# define TO_32BIT(reg) /* Do nothing. */
> > +
> > +# define VEC_CMP VPCMPEQ
> > +
> > +# elif VEC_SIZE == 64
> > +
> > +# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg)
> > +# define TEST_ZERO(reg) neg %VGPR(reg)
> > +
> > +
> > + /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
> > + int. We have two methods for this. If the mask was branched
> > + on, we use `neg` for the branch, then `sbb` to get the 32-bit
> > + return. If the mask was not branched on, we just use
> > + `popcntq`. */
> > +# define TO_32BIT_P1(reg) TEST_ZERO(reg)
> > +# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
> > +# define TO_32BIT(reg) popcntq %reg, %reg
> > +
> > +# define VEC_CMP VPCMPNEQ
> > +
> > +# else
> > +# error "Unsupported VEC_SIZE"
> > +# endif
> > +
> > +
> > # define VMOVU_MASK vmovdqu8
> > -# define VMOVU vmovdqu64
> > -# define VPCMP vpcmpub
> > +# define VPCMPNEQ vpcmpneqb
> > +# define VPCMPEQ vpcmpeqb
> > # define VPTEST vptestmb
> >
> > -# define VEC_SIZE 32
> > # define PAGE_SIZE 4096
> >
> > -# define YMM0 ymm16
> > -# define YMM1 ymm17
> > -# define YMM2 ymm18
> > -# define YMM3 ymm19
> > -# define YMM4 ymm20
> > -# define YMM5 ymm21
> > -# define YMM6 ymm22
> > -
> > -
> > - .section .text.evex, "ax", @progbits
> > + .section SECTION(.text), "ax", @progbits
> > ENTRY_P2ALIGN (MEMCMPEQ, 6)
> > # ifdef __ILP32__
> > /* Clear the upper 32 bits. */
> > @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
> > ja L(more_1x_vec)
> >
> > /* Create mask of bytes that are guranteed to be valid because
> > - of length (edx). Using masked movs allows us to skip checks for
> > - page crosses/zero size. */
> > - movl $-1, %ecx
> > - bzhil %edx, %ecx, %ecx
> > - kmovd %ecx, %k2
> > + of length (edx). Using masked movs allows us to skip checks
> > + for page crosses/zero size. */
> > + mov $-1, %VRAX
> > + bzhi %VRDX, %VRAX, %VRAX
> > + /* NB: A `jz` might be useful here. Page-faults that are
> > + invalidated by predicate execution (the evex mask) can be
> > + very slow. The expectation is this is not the norm, and
> > + "most" code will not regularly call 'memcmp' with length = 0
> > + and memory that is not wired up. */
> > + KMOV %VRAX, %k2
> >
> > /* Use masked loads as VEC_SIZE could page cross where length
> > (edx) would not. */
> > - VMOVU_MASK (%rsi), %YMM2{%k2}
> > - VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> > - kmovd %k1, %eax
> > + VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
> > + VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
> > + KMOV %k1, %VRAX
> > + TO_32BIT (VRAX)
> > ret
> >
> > -
> > + .p2align 4,, 3
> > L(last_1x_vec):
> > - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> > - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> > - kmovd %k1, %eax
> > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> > + VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
> > + KMOV %k1, %VRAX
> > + TO_32BIT_P1 (rax)
> > L(return_neq0):
> > + TO_32BIT_P2 (rax)
> > ret
> >
> >
> > -
> > - .p2align 4
> > + .p2align 4,, 12
> > L(more_1x_vec):
> > /* From VEC + 1 to 2 * VEC. */
> > - VMOVU (%rsi), %YMM1
> > + VMOVU (%rsi), %VMM(1)
> > /* Use compare not equals to directly check for mismatch. */
> > - VPCMP $4,(%rdi), %YMM1, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + VPCMPNEQ (%rdi), %VMM(1), %k1
> > + KMOV %k1, %VRAX
> > + TEST_ZERO (rax)
> > jnz L(return_neq0)
> >
> > cmpq $(VEC_SIZE * 2), %rdx
> > jbe L(last_1x_vec)
> >
> > /* Check second VEC no matter what. */
> > - VMOVU VEC_SIZE(%rsi), %YMM2
> > - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + VMOVU VEC_SIZE(%rsi), %VMM(2)
> > + VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
> > + KMOV %k1, %VRAX
> > + TEST_ZERO (rax)
> > jnz L(return_neq0)
> >
> > /* Less than 4 * VEC. */
> > @@ -117,16 +153,16 @@ L(more_1x_vec):
> > jbe L(last_2x_vec)
> >
> > /* Check third and fourth VEC no matter what. */
> > - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> > - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3)
> > + VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1
> > + KMOV %k1, %VRAX
> > + TEST_ZERO_VCMP (rax)
> > jnz L(return_neq0)
> >
> > - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> > - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4)
> > + VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1
> > + KMOV %k1, %VRAX
> > + TEST_ZERO_VCMP (rax)
> > jnz L(return_neq0)
> >
> > /* Go to 4x VEC loop. */
> > @@ -136,8 +172,8 @@ L(more_1x_vec):
> > /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
> > branches. */
> >
> > - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
> > - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
> > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2)
> > addq %rdx, %rdi
> >
> > /* Wait to load from s1 until addressed adjust due to
> > @@ -145,26 +181,32 @@ L(more_1x_vec):
> >
> > /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
> > will have some 1s. */
> > - vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
> > - /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
> > - oring with YMM1. Result is stored in YMM1. */
> > - vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
> > -
> > - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> > - vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> > - /* Or together YMM1, YMM2, and YMM3 into YMM3. */
> > - VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4
> > - vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4
> > -
> > - /* Or together YMM2, YMM3, and YMM4 into YMM4. */
> > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> > -
> > - /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
> > - VPTEST %YMM4, %YMM4, %k1
> > - kmovd %k1, %eax
> > + vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1)
> > + /* Ternary logic to xor -(VEC_SIZE * 2)(%rdi) with VEC(2) while
> > + oring with VEC(1). Result is stored in VEC(2). */
> > + vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2)
> > +
> > + cmpl $(VEC_SIZE * 6), %edx
> > + jbe L(4x_last_2x_vec)
> > +
> > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3)
> > + vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3)
> > + /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */
> > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4)
> > + vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4)
> > +
> > + /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2). */
> > + vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2)
> > +
> > + /* Compare VEC(2) with 0. If any 1s, s1 and s2 don't match. */
> > +L(4x_last_2x_vec):
> > + VPTEST %VMM(2), %VMM(2), %k1
> > + KMOV %k1, %VRAX
> > + TO_32BIT (VRAX)
> > ret
> >
> > - .p2align 4
> > +
> > + .p2align 4,, 10
> > L(more_8x_vec):
> > /* Set end of s1 in rdx. */
> > leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
> > @@ -175,67 +217,80 @@ L(more_8x_vec):
> > andq $-VEC_SIZE, %rdi
> > /* Adjust because first 4x vec where check already. */
> > subq $-(VEC_SIZE * 4), %rdi
> > - .p2align 4
> > + .p2align 5,, 12
> > + .p2align 4,, 8
> > L(loop_4x_vec):
> > - VMOVU (%rsi, %rdi), %YMM1
> > - vpxorq (%rdi), %YMM1, %YMM1
> > + VMOVU (%rsi, %rdi), %VMM(1)
> > + vpxorq (%rdi), %VMM(1), %VMM(1)
> >
> > - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
> > - vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
> > + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2)
> > + vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2)
> >
> > - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> > - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3)
> > + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3)
> >
> > - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> > - vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4)
> > + vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4)
> >
> > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> > - VPTEST %YMM4, %YMM4, %k1
> > - kmovd %k1, %eax
> > - testl %eax, %eax
> > + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4)
> > + VPTEST %VMM(4), %VMM(4), %k1
> > + KMOV %k1, %VRAX
> > + TEST_ZERO (rax)
> > jnz L(return_neq2)
> > subq $-(VEC_SIZE * 4), %rdi
> > cmpq %rdx, %rdi
> > jb L(loop_4x_vec)
> >
> > subq %rdx, %rdi
> > - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> > - vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
> > +
> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4)
> > + vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4)
> > /* rdi has 4 * VEC_SIZE - remaining length. */
> > - cmpl $(VEC_SIZE * 3), %edi
> > - jae L(8x_last_1x_vec)
> > +
> > /* Load regardless of branch. */
> > - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> > - /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
> > - oring with YMM4. Result is stored in YMM4. */
> > - vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3)
> > + /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while
> > + oring with VEC(4). Result is stored in VEC(4). */
> > + vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4)
> > +
> > + /* Separate logic, as we can only use testb for VEC_SIZE == 64.
> > + */
> > +# if VEC_SIZE == 64
> > + testb %dil, %dil
> > + js L(8x_last_2x_vec)
> > +# else
> > cmpl $(VEC_SIZE * 2), %edi
> > - jae L(8x_last_2x_vec)
> > + jge L(8x_last_2x_vec)
> > +# endif
> >
> > - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
> > - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
> > + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2)
> > + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2)
> >
> > - VMOVU (%rsi, %rdx), %YMM1
> > - vpxorq (%rdx), %YMM1, %YMM1
> > + VMOVU (%rsi, %rdx), %VMM(1)
> > + vpxorq (%rdx), %VMM(1), %VMM(1)
> >
> > - vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
> > + vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4)
> > L(8x_last_1x_vec):
> > L(8x_last_2x_vec):
> > - VPTEST %YMM4, %YMM4, %k1
> > - kmovd %k1, %eax
> > + VPTEST %VMM(4), %VMM(4), %k1
> > + KMOV %k1, %VRAX
> > + TO_32BIT_P1 (rax)
> > L(return_neq2):
> > + TO_32BIT_P2 (rax)
> > ret
> >
> > - .p2align 4,, 8
> > + .p2align 4,, 4
> > L(last_2x_vec):
> > - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> > - vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> > - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> > - vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> > - VPTEST %YMM2, %YMM2, %k1
> > - kmovd %k1, %eax
> > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1)
> > + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1)
> > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2)
> > + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2)
> > + VPTEST %VMM(2), %VMM(2), %k1
> > + KMOV %k1, %VRAX
> > + TO_32BIT (VRAX)
> > ret
> >
> > - /* 1 Bytes from next cache line. */
> > + /* evex256: 1 byte from next cache line. evex512: 15 bytes from
> > + next cache line. */
> > END (MEMCMPEQ)
> > #endif
> > --
> > 2.34.1
> >
>
> LGTM.
>
> Thanks.
>
> --
> H.J.
Err, realized I had messed up my extraction script when consolidating
these numbers (the ratios are correct, but I had pulled in the wrong
fields when grabbing the actual times).
Correct numbers are:
align1 ,align2 ,length ,result ,memcmpeq-v2 ,current ,memcmpeq-v2/current
0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
0 ,0 ,193 ,0 ,6.804 ,6.802 ,1.000
0 ,0 ,193 ,1 ,6.950 ,6.754 ,1.029
0 ,0 ,193 ,18446744073709551615 ,6.792 ,6.719 ,1.011
0 ,0 ,225 ,0 ,6.625 ,6.699 ,0.989
0 ,0 ,225 ,1 ,6.776 ,6.735 ,1.003
0 ,0 ,225 ,18446744073709551615 ,6.758 ,6.738 ,0.992
0 ,0 ,256 ,0 ,5.402 ,5.462 ,0.989
0 ,0 ,256 ,1 ,5.364 ,5.483 ,0.978
0 ,0 ,256 ,18446744073709551615 ,5.341 ,5.539 ,0.964
Will update the commit message before pushing.
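
As context for where the [VEC_SIZE * 4 + 1, VEC_SIZE * 6] win comes
from, here is a minimal C sketch of the tail handling with the added
length <= VEC_SIZE * 6 branch described in change (2). It is only an
illustration of the control flow, not glibc code; the cmp_vec() helper
and the function name are hypothetical:

#include <stddef.h>
#include <stdbool.h>

#define VEC 64	/* evex512; 32 for the evex256 build */

/* Hypothetical stand-in for one vector load from each string plus the
   vpxorq/accumulate step; returns true on any byte mismatch.  */
extern bool cmp_vec (const char *s1, const char *s2, size_t off);

/* Tail for lengths in (4 * VEC, 8 * VEC]: the first 4 * VEC bytes were
   already compared, so when len <= 6 * VEC the last two loads below
   would only re-cover bytes that are already checked and can be
   skipped -- that is the extra branch added in change (2).  */
static bool
tail_4x_to_8x (const char *s1, const char *s2, size_t len)
{
  bool neq = cmp_vec (s1, s2, len - 1 * VEC)
	     | cmp_vec (s1, s2, len - 2 * VEC);
  if (len > 6 * VEC)
    neq |= cmp_vec (s1, s2, len - 3 * VEC)
	   | cmp_vec (s1, s2, len - 4 * VEC);
  return neq;
}
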
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2022-10-31 16:42 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
2022-10-31 15:47 ` H.J. Lu
2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
2022-10-31 15:48 ` H.J. Lu
2022-10-31 16:42 ` Noah Goldstein
2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar