* [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests @ 2022-10-29 20:19 Noah Goldstein 2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein ` (2 more replies) 0 siblings, 3 replies; 7+ messages in thread From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw) To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos len=0 is valid and fairly common so should be tested. --- benchtests/bench-memcmp.c | 18 +++++++++--------- string/test-memcmp.c | 16 ++++++++++------ 2 files changed, 19 insertions(+), 15 deletions(-) diff --git a/benchtests/bench-memcmp.c b/benchtests/bench-memcmp.c index d64eaa992e..b2816baebe 100644 --- a/benchtests/bench-memcmp.c +++ b/benchtests/bench-memcmp.c @@ -63,7 +63,7 @@ IMPL (MEMCMP, 1) static void do_one_test (json_ctx_t *json_ctx, impl_t *impl, const CHAR *s1, - const CHAR *s2, size_t len, int exp_result) + const CHAR *s2, size_t len) { size_t i, iters = INNER_LOOP_ITERS_LARGE; timing_t start, stop, cur; @@ -87,9 +87,6 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len, size_t i; CHAR *s1, *s2; - if (len == 0) - return; - align1 &= (4096 - CHARBYTES); if (align1 + (len + 1) * CHARBYTES >= page_size) return; @@ -111,13 +108,16 @@ do_test (json_ctx_t *json_ctx, size_t align1, size_t align2, size_t len, for (i = 0; i < len; i++) s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % MAX_CHAR; - s1[len] = align1; - s2[len] = align2; - s2[len - 1] -= exp_result; + if (len) + { + s1[len] = align1; + s2[len] = align2; + s2[len - 1] -= exp_result; + } FOR_EACH_IMPL (impl, 0) { - do_one_test (json_ctx, impl, s1, s2, len, exp_result); + do_one_test (json_ctx, impl, s1, s2, len); } json_array_end (json_ctx); @@ -147,7 +147,7 @@ test_main (void) json_array_end (&json_ctx); json_array_begin (&json_ctx, "results"); - for (i = 1; i < 32; ++i) + for (i = 0; i < 32; ++i) { do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 0); do_test (&json_ctx, i * CHARBYTES, i * CHARBYTES, i, 1); diff --git a/string/test-memcmp.c b/string/test-memcmp.c index 181b689f68..18d8b0d9f1 100644 --- a/string/test-memcmp.c +++ b/string/test-memcmp.c @@ -117,9 +117,6 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result) size_t i; CHAR *s1, *s2; - if (len == 0) - return; - align1 &= (4096 - CHARBYTES); if (align1 + (len + 1) * CHARBYTES >= page_size) return; @@ -134,9 +131,16 @@ do_test (size_t align1, size_t align2, size_t len, int exp_result) for (i = 0; i < len; i++) s1[i] = s2[i] = 1 + (23 << ((CHARBYTES - 1) * 8)) * i % CHAR__MAX; - s1[len] = align1; - s2[len] = align2; - s2[len - 1] -= exp_result; + if (len) + { + s1[len] = align1; + s2[len] = align2; + s2[len - 1] -= exp_result; + } + else + { + exp_result = 0; + } FOR_EACH_IMPL (impl, 0) do_one_test (impl, s1, s2, len, exp_result); -- 2.34.1 ^ permalink raw reply [flat|nested] 7+ messages in thread
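A note for readers following along: the contract this patch starts testing is that memcmp (s1, s2, 0) must return 0 without reading through either pointer. A minimal standalone illustration of that property (this is a sketch, not glibc's test harness):

    #include <assert.h>
    #include <string.h>

    int
    main (void)
    {
      char a[1], b[1];
      /* One-past-the-end pointers are valid to pass but invalid to
         dereference, so a correct memcmp with len=0 must touch no
         memory and report equality.  */
      assert (memcmp (a + 1, b + 1, 0) == 0);
      return 0;
    }

This is exactly the case the masked-load entry paths in patches 2/3 have to get right: a length-0 mask must suppress all loads.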
* [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes 2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein @ 2022-10-29 20:19 ` Noah Goldstein 2022-10-31 15:47 ` H.J. Lu 2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein 2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar 2 siblings, 1 reply; 7+ messages in thread From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw) To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos The only change to the existing generated code is `tzcnt` -> `bsf` to save a byte of code size here and there. Rewriting with VMM API allows for memcmp-evex-movbe to be used with evex512 by including "x86-evex512-vecs.h" at the top. Complete check passes on x86-64. --- sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 308 +++++++++++-------- 1 file changed, 175 insertions(+), 133 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S index bc017768be..f6c379831e 100644 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S @@ -62,44 +62,38 @@ Latency: # define MEMCMP __memcmp_evex_movbe # endif -# define VMOVU vmovdqu64 +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif # ifdef USE_AS_WMEMCMP # define VMOVU_MASK vmovdqu32 # define CHAR_SIZE 4 # define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd # define VPTEST vptestmd + +# define USE_WIDE_CHAR # else # define VMOVU_MASK vmovdqu8 # define CHAR_SIZE 1 # define VPCMP vpcmpub +# define VPCMPEQ vpcmpeqb # define VPTEST vptestmb # endif +# include "reg-macros.h" -# define VEC_SIZE 32 # define PAGE_SIZE 4096 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -# define XMM0 xmm16 -# define XMM1 xmm17 -# define XMM2 xmm18 -# define YMM0 ymm16 -# define XMM1 xmm17 -# define XMM2 xmm18 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 /* Warning! wmemcmp has to use SIGNED comparison for elements. memcmp has to use UNSIGNED comparison for elemnts. */ - .section .text.evex,"ax",@progbits + .section SECTION(.text), "ax", @progbits /* Cache align memcmp entry. This allows for much more thorough frontend optimization. */ ENTRY_P2ALIGN (MEMCMP, 6) @@ -111,23 +105,40 @@ ENTRY_P2ALIGN (MEMCMP, 6) /* Fall through for [0, VEC_SIZE] as its the hottest. */ ja L(more_1x_vec) - /* Create mask for CHAR's we want to compare. This allows us to - avoid having to include page cross logic. */ - movl $-1, %ecx - bzhil %edx, %ecx, %ecx - kmovd %ecx, %k2 + /* Create mask of bytes that are guranteed to be valid because + of length (edx). Using masked movs allows us to skip checks + for page crosses/zero size. */ + mov $-1, %VRAX + bzhi %VRDX, %VRAX, %VRAX + /* NB: A `jz` might be useful here. Page-faults that are + invalidated by predicate execution (the evex mask) can be + very slow. The expectation is this is not the norm so and + "most" code will not regularly call 'memcmp' with length = 0 + and memory that is not wired up. */ + KMOV %VRAX, %k2 + + /* Safe to load full ymm with mask. */ - VMOVU_MASK (%rsi), %YMM2{%k2} - VPCMP $4,(%rdi), %YMM2, %k1{%k2} - kmovd %k1, %eax - testl %eax, %eax + VMOVU_MASK (%rsi), %VMM(2){%k2}{z} + /* Slightly different method for VEC_SIZE == 64 to save a bit of + code size. This allows us to fit L(return_vec_0) entirely in + the first cache line. 
*/ +# if VEC_SIZE == 64 + VPCMPEQ (%rdi), %VMM(2), %k1{%k2} + KMOV %k1, %VRCX + sub %VRCX, %VRAX +# else + VPCMP $4, (%rdi), %VMM(2), %k1{%k2} + KMOV %k1, %VRAX + test %VRAX, %VRAX +# endif jnz L(return_vec_0) ret - .p2align 4 + .p2align 4,, 11 L(return_vec_0): - tzcntl %eax, %eax + bsf %VRAX, %VRAX # ifdef USE_AS_WMEMCMP movl (%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx @@ -138,33 +149,36 @@ L(return_vec_0): leal -1(%rdx, %rdx), %eax # else movzbl (%rsi, %rax), %ecx +# if VEC_SIZE == 64 + movb (%rdi, %rax), %al +# else movzbl (%rdi, %rax), %eax +# endif subl %ecx, %eax # endif ret - - .p2align 4 + .p2align 4,, 11 L(more_1x_vec): /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ - VMOVU (%rsi), %YMM1 + VMOVU (%rsi), %VMM(1) /* Use compare not equals to directly check for mismatch. */ - VPCMP $4,(%rdi), %YMM1, %k1 - kmovd %k1, %eax + VPCMP $4, (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX /* NB: eax must be destination register if going to - L(return_vec_[0,2]). For L(return_vec_3) destination register - must be ecx. */ - testl %eax, %eax + L(return_vec_[0,2]). For L(return_vec_3) destination + register must be ecx. */ + test %VRAX, %VRAX jnz L(return_vec_0) cmpq $(CHAR_PER_VEC * 2), %rdx jbe L(last_1x_vec) /* Check second VEC no matter what. */ - VMOVU VEC_SIZE(%rsi), %YMM2 - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU VEC_SIZE(%rsi), %VMM(2) + VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(return_vec_1) /* Less than 4 * VEC. */ @@ -172,16 +186,16 @@ L(more_1x_vec): jbe L(last_2x_vec) /* Check third and fourth VEC no matter what. */ - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) + VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(return_vec_2) - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 - kmovd %k1, %ecx - testl %ecx, %ecx + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) + VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1 + KMOV %k1, %VRCX + test %VRCX, %VRCX jnz L(return_vec_3) /* Go to 4x VEC loop. */ @@ -192,8 +206,8 @@ L(more_1x_vec): branches. */ /* Load first two VEC from s2 before adjusting addresses. */ - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2) leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi @@ -202,56 +216,61 @@ L(more_1x_vec): /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it will have some 1s. */ - vpxorq (%rdi), %YMM1, %YMM1 - vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 + vpxorq (%rdi), %VMM(1), %VMM(1) + vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2) - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while - oring with YMM1. Result is stored in YMM4. */ - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while + oring with VEC(1). Result is stored in VEC(4). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4) - /* Or together YMM2, YMM3, and YMM4 into YMM4. 
*/ - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */ + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) - /* Test YMM4 against itself. Store any CHAR mismatches in k1. + /* Test VEC(4) against itself. Store any CHAR mismatches in k1. */ - VPTEST %YMM4, %YMM4, %k1 + VPTEST %VMM(4), %VMM(4), %k1 /* k1 must go to ecx for L(return_vec_0_1_2_3). */ - kmovd %k1, %ecx - testl %ecx, %ecx + KMOV %k1, %VRCX + test %VRCX, %VRCX jnz L(return_vec_0_1_2_3) /* NB: eax must be zero to reach here. */ ret - .p2align 4,, 8 + .p2align 4,, 9 L(8x_end_return_vec_0_1_2_3): movq %rdx, %rdi L(8x_return_vec_0_1_2_3): + /* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64. */ +# if VEC_SIZE == 64 + KMOV %k1, %VRCX +# endif addq %rdi, %rsi L(return_vec_0_1_2_3): - VPTEST %YMM1, %YMM1, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPTEST %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(return_vec_0) - VPTEST %YMM2, %YMM2, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPTEST %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(return_vec_1) - VPTEST %YMM3, %YMM3, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPTEST %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(return_vec_2) + .p2align 4,, 2 L(return_vec_3): /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache line. */ - bsfl %ecx, %ecx + bsf %VRCX, %VRCX # ifdef USE_AS_WMEMCMP movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax xorl %edx, %edx @@ -266,11 +285,11 @@ L(return_vec_3): ret - .p2align 4 + .p2align 4,, 8 L(return_vec_1): /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one fetch block. */ - bsfl %eax, %eax + bsf %VRAX, %VRAX # ifdef USE_AS_WMEMCMP movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx @@ -284,11 +303,11 @@ L(return_vec_1): # endif ret - .p2align 4,, 10 + .p2align 4,, 7 L(return_vec_2): /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one fetch block. */ - bsfl %eax, %eax + bsf %VRAX, %VRAX # ifdef USE_AS_WMEMCMP movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx xorl %edx, %edx @@ -302,7 +321,7 @@ L(return_vec_2): # endif ret - .p2align 4 + .p2align 4,, 8 L(more_8x_vec): /* Set end of s1 in rdx. */ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx @@ -316,62 +335,82 @@ L(more_8x_vec): .p2align 4 L(loop_4x_vec): - VMOVU (%rsi, %rdi), %YMM1 - vpxorq (%rdi), %YMM1, %YMM1 - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 - vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 - VPTEST %YMM4, %YMM4, %k1 - kmovd %k1, %ecx - testl %ecx, %ecx + VMOVU (%rsi, %rdi), %VMM(1) + vpxorq (%rdi), %VMM(1), %VMM(1) + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2) + vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2) + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3) + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4) + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4) + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) + VPTEST %VMM(4), %VMM(4), %k1 + /* If VEC_SIZE == 64 just branch with KTEST. We have free port0 + space and it allows the loop to fit in 2x cache lines + instead of 3. 
*/ +# if VEC_SIZE == 64 + KTEST %k1, %k1 +# else + KMOV %k1, %VRCX + test %VRCX, %VRCX +# endif jnz L(8x_return_vec_0_1_2_3) subq $-(VEC_SIZE * 4), %rdi cmpq %rdx, %rdi jb L(loop_4x_vec) - subq %rdx, %rdi /* rdi has 4 * VEC_SIZE - remaining length. */ cmpl $(VEC_SIZE * 3), %edi - jae L(8x_last_1x_vec) + jge L(8x_last_1x_vec) /* Load regardless of branch. */ - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 - cmpl $(VEC_SIZE * 2), %edi - jae L(8x_last_2x_vec) + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3) - vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 - - VMOVU (%rsi, %rdx), %YMM1 - vpxorq (%rdx), %YMM1, %YMM1 + /* Seperate logic as we can only use testb for VEC_SIZE == 64. + */ +# if VEC_SIZE == 64 + testb %dil, %dil + js L(8x_last_2x_vec) +# else + cmpl $(VEC_SIZE * 2), %edi + jge L(8x_last_2x_vec) +# endif - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 - vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 - VPTEST %YMM4, %YMM4, %k1 - kmovd %k1, %ecx - testl %ecx, %ecx + vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3) + + VMOVU (%rsi, %rdx), %VMM(1) + vpxorq (%rdx), %VMM(1), %VMM(1) + + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2) + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2) + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4) + vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4) + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) + VPTEST %VMM(4), %VMM(4), %k1 + /* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in + `k1` if VEC_SIZE == 64. */ +# if VEC_SIZE == 64 + KTEST %k1, %k1 +# else + KMOV %k1, %VRCX + test %VRCX, %VRCX +# endif jnz L(8x_end_return_vec_0_1_2_3) /* NB: eax must be zero to reach here. */ ret /* Only entry is from L(more_8x_vec). */ - .p2align 4,, 10 + .p2align 4,, 6 L(8x_last_2x_vec): - VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 - kmovd %k1, %eax - testl %eax, %eax + VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(8x_return_vec_2) - /* Naturally aligned to 16 bytes. */ + .p2align 4,, 5 L(8x_last_1x_vec): - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 - VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1) + VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(8x_return_vec_3) ret @@ -383,7 +422,7 @@ L(8x_last_1x_vec): L(8x_return_vec_2): subq $VEC_SIZE, %rdx L(8x_return_vec_3): - bsfl %eax, %eax + bsf %VRAX, %VRAX # ifdef USE_AS_WMEMCMP leaq (%rdx, %rax, CHAR_SIZE), %rax movl (VEC_SIZE * 3)(%rax), %ecx @@ -399,32 +438,34 @@ L(8x_return_vec_3): # endif ret - .p2align 4,, 10 + .p2align 4,, 8 L(last_2x_vec): /* Check second to last VEC. */ - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 - VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(return_vec_1_end) /* Check last VEC. */ - .p2align 4 + .p2align 4,, 8 L(last_1x_vec): - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX jnz L(return_vec_0_end) ret - /* Don't align. 
Takes 2-fetch blocks either way and aligning - will cause code to spill into another cacheline. */ + /* Don't fully align. Takes 2-fetch blocks either way and + aligning will cause code to spill into another cacheline. + */ + .p2align 4,, 3 L(return_vec_1_end): /* Use bsf to save code size. This is necessary to have L(one_or_less) fit in aligning bytes between. */ - bsfl %eax, %eax + bsf %VRAX, %VRAX addl %edx, %eax # ifdef USE_AS_WMEMCMP movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx @@ -439,10 +480,11 @@ L(return_vec_1_end): # endif ret + .p2align 4,, 2 /* Don't align. Takes 2-fetch blocks either way and aligning will cause code to spill into another cacheline. */ L(return_vec_0_end): - tzcntl %eax, %eax + bsf %VRAX, %VRAX addl %edx, %eax # ifdef USE_AS_WMEMCMP movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx @@ -456,7 +498,7 @@ L(return_vec_0_end): subl %ecx, %eax # endif ret - /* 1-byte until next cache line. */ - + /* evex256: 2-byte until next cache line. evex512: 46-bytes + until next cache line. */ END (MEMCMP) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 7+ messages in thread
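The masked entry path described in the patch maps fairly directly onto AVX-512 intrinsics. Below is a rough C model of the VEC_SIZE == 32 byte variant — the function name is invented, and unlike the assembly (which zero-mask-loads only s2 and uses a write-masked compare against s1's memory operand) it mask-loads both sides. Build with something like gcc -O2 -mavx512vl -mavx512bw -mbmi -mbmi2:

    #include <immintrin.h>
    #include <stddef.h>

    /* Sketch of the len <= 32 entry path: the bzhi-derived mask makes
       the loads and compare ignore bytes past `len`, so no page-cross
       or zero-length checks are needed.  Masked-off elements are
       architecturally guaranteed not to fault, which is what lets the
       patch drop the page-cross logic (at the cost of the slow
       masked-fault case the NB comment warns about).  */
    static int
    memcmp_upto_1x_vec (const void *s1, const void *s2, size_t len)
    {
      __mmask32 valid = _bzhi_u32 (-1U, (unsigned int) len);
      __m256i v2 = _mm256_maskz_loadu_epi8 (valid, s2);
      __m256i v1 = _mm256_maskz_loadu_epi8 (valid, s1);
      __mmask32 neq = _mm256_mask_cmpneq_epu8_mask (valid, v1, v2);
      if (neq == 0)
        return 0;
      unsigned int i = _tzcnt_u32 (neq);   /* index of first mismatch */
      return (int) ((const unsigned char *) s1)[i]
             - (int) ((const unsigned char *) s2)[i];
    }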
* Re: [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes 2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein @ 2022-10-31 15:47 ` H.J. Lu 0 siblings, 0 replies; 7+ messages in thread From: H.J. Lu @ 2022-10-31 15:47 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > The only change to the existing generated code is `tzcnt` -> `bsf` to > save a byte of code size here and there. > > Rewriting with VMM API allows for memcmp-evex-movbe to be used with > evex512 by including "x86-evex512-vecs.h" at the top. > > Complete check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 308 +++++++++++-------- > 1 file changed, 175 insertions(+), 133 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S > index bc017768be..f6c379831e 100644 > --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S > +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S > @@ -62,44 +62,38 @@ Latency: > # define MEMCMP __memcmp_evex_movbe > # endif > > -# define VMOVU vmovdqu64 > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > > # ifdef USE_AS_WMEMCMP > # define VMOVU_MASK vmovdqu32 > # define CHAR_SIZE 4 > # define VPCMP vpcmpd > +# define VPCMPEQ vpcmpeqd > # define VPTEST vptestmd > + > +# define USE_WIDE_CHAR > # else > # define VMOVU_MASK vmovdqu8 > # define CHAR_SIZE 1 > # define VPCMP vpcmpub > +# define VPCMPEQ vpcmpeqb > # define VPTEST vptestmb > # endif > > +# include "reg-macros.h" > > -# define VEC_SIZE 32 > # define PAGE_SIZE 4096 > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > -# define XMM0 xmm16 > -# define XMM1 xmm17 > -# define XMM2 xmm18 > -# define YMM0 ymm16 > -# define XMM1 xmm17 > -# define XMM2 xmm18 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > > /* Warning! > wmemcmp has to use SIGNED comparison for elements. > memcmp has to use UNSIGNED comparison for elemnts. > */ > > - .section .text.evex,"ax",@progbits > + .section SECTION(.text), "ax", @progbits > /* Cache align memcmp entry. This allows for much more thorough > frontend optimization. */ > ENTRY_P2ALIGN (MEMCMP, 6) > @@ -111,23 +105,40 @@ ENTRY_P2ALIGN (MEMCMP, 6) > /* Fall through for [0, VEC_SIZE] as its the hottest. */ > ja L(more_1x_vec) > > - /* Create mask for CHAR's we want to compare. This allows us to > - avoid having to include page cross logic. */ > - movl $-1, %ecx > - bzhil %edx, %ecx, %ecx > - kmovd %ecx, %k2 > + /* Create mask of bytes that are guranteed to be valid because > + of length (edx). Using masked movs allows us to skip checks > + for page crosses/zero size. */ > + mov $-1, %VRAX > + bzhi %VRDX, %VRAX, %VRAX > + /* NB: A `jz` might be useful here. Page-faults that are > + invalidated by predicate execution (the evex mask) can be > + very slow. The expectation is this is not the norm so and > + "most" code will not regularly call 'memcmp' with length = 0 > + and memory that is not wired up. */ > + KMOV %VRAX, %k2 > + > + > > /* Safe to load full ymm with mask. */ > - VMOVU_MASK (%rsi), %YMM2{%k2} > - VPCMP $4,(%rdi), %YMM2, %k1{%k2} > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU_MASK (%rsi), %VMM(2){%k2}{z} > + /* Slightly different method for VEC_SIZE == 64 to save a bit of > + code size. 
This allows us to fit L(return_vec_0) entirely in > + the first cache line. */ > +# if VEC_SIZE == 64 > + VPCMPEQ (%rdi), %VMM(2), %k1{%k2} > + KMOV %k1, %VRCX > + sub %VRCX, %VRAX > +# else > + VPCMP $4, (%rdi), %VMM(2), %k1{%k2} > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > +# endif > jnz L(return_vec_0) > ret > > - .p2align 4 > + .p2align 4,, 11 > L(return_vec_0): > - tzcntl %eax, %eax > + bsf %VRAX, %VRAX > # ifdef USE_AS_WMEMCMP > movl (%rdi, %rax, CHAR_SIZE), %ecx > xorl %edx, %edx > @@ -138,33 +149,36 @@ L(return_vec_0): > leal -1(%rdx, %rdx), %eax > # else > movzbl (%rsi, %rax), %ecx > +# if VEC_SIZE == 64 > + movb (%rdi, %rax), %al > +# else > movzbl (%rdi, %rax), %eax > +# endif > subl %ecx, %eax > # endif > ret > > - > - .p2align 4 > + .p2align 4,, 11 > L(more_1x_vec): > /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ > - VMOVU (%rsi), %YMM1 > + VMOVU (%rsi), %VMM(1) > /* Use compare not equals to directly check for mismatch. */ > - VPCMP $4,(%rdi), %YMM1, %k1 > - kmovd %k1, %eax > + VPCMP $4, (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > /* NB: eax must be destination register if going to > - L(return_vec_[0,2]). For L(return_vec_3) destination register > - must be ecx. */ > - testl %eax, %eax > + L(return_vec_[0,2]). For L(return_vec_3) destination > + register must be ecx. */ > + test %VRAX, %VRAX > jnz L(return_vec_0) > > cmpq $(CHAR_PER_VEC * 2), %rdx > jbe L(last_1x_vec) > > /* Check second VEC no matter what. */ > - VMOVU VEC_SIZE(%rsi), %YMM2 > - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU VEC_SIZE(%rsi), %VMM(2) > + VPCMP $4, VEC_SIZE(%rdi), %VMM(2), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_1) > > /* Less than 4 * VEC. */ > @@ -172,16 +186,16 @@ L(more_1x_vec): > jbe L(last_2x_vec) > > /* Check third and fourth VEC no matter what. */ > - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 > - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) > + VPCMP $4, (VEC_SIZE * 2)(%rdi), %VMM(3), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_2) > > - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 > - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 > - kmovd %k1, %ecx > - testl %ecx, %ecx > + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) > + VPCMP $4, (VEC_SIZE * 3)(%rdi), %VMM(4), %k1 > + KMOV %k1, %VRCX > + test %VRCX, %VRCX > jnz L(return_vec_3) > > /* Go to 4x VEC loop. */ > @@ -192,8 +206,8 @@ L(more_1x_vec): > branches. */ > > /* Load first two VEC from s2 before adjusting addresses. */ > - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 > - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(2) > leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi > leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi > > @@ -202,56 +216,61 @@ L(more_1x_vec): > > /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it > will have some 1s. */ > - vpxorq (%rdi), %YMM1, %YMM1 > - vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 > + vpxorq (%rdi), %VMM(1), %VMM(1) > + vpxorq (VEC_SIZE)(%rdi), %VMM(2), %VMM(2) > > - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 > - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 > + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) > > - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 > - /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while > - oring with YMM1. Result is stored in YMM4. 
*/ > - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 > + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) > + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with VEC(4) while > + oring with VEC(1). Result is stored in VEC(4). */ > + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4) > > - /* Or together YMM2, YMM3, and YMM4 into YMM4. */ > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 > + /* Or together VEC(2), VEC(3), and VEC(4) into VEC(4). */ > + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) > > - /* Test YMM4 against itself. Store any CHAR mismatches in k1. > + /* Test VEC(4) against itself. Store any CHAR mismatches in k1. > */ > - VPTEST %YMM4, %YMM4, %k1 > + VPTEST %VMM(4), %VMM(4), %k1 > /* k1 must go to ecx for L(return_vec_0_1_2_3). */ > - kmovd %k1, %ecx > - testl %ecx, %ecx > + KMOV %k1, %VRCX > + test %VRCX, %VRCX > jnz L(return_vec_0_1_2_3) > /* NB: eax must be zero to reach here. */ > ret > > > - .p2align 4,, 8 > + .p2align 4,, 9 > L(8x_end_return_vec_0_1_2_3): > movq %rdx, %rdi > L(8x_return_vec_0_1_2_3): > + /* L(loop_4x_vec) leaves result in `k1` for VEC_SIZE == 64. */ > +# if VEC_SIZE == 64 > + KMOV %k1, %VRCX > +# endif > addq %rdi, %rsi > L(return_vec_0_1_2_3): > - VPTEST %YMM1, %YMM1, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPTEST %VMM(1), %VMM(1), %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_0) > > - VPTEST %YMM2, %YMM2, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPTEST %VMM(2), %VMM(2), %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_1) > > - VPTEST %YMM3, %YMM3, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPTEST %VMM(3), %VMM(3), %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_2) > + .p2align 4,, 2 > L(return_vec_3): > /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one > fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache > line. */ > - bsfl %ecx, %ecx > + bsf %VRCX, %VRCX > # ifdef USE_AS_WMEMCMP > movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax > xorl %edx, %edx > @@ -266,11 +285,11 @@ L(return_vec_3): > ret > > > - .p2align 4 > + .p2align 4,, 8 > L(return_vec_1): > /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one > fetch block. */ > - bsfl %eax, %eax > + bsf %VRAX, %VRAX > # ifdef USE_AS_WMEMCMP > movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx > xorl %edx, %edx > @@ -284,11 +303,11 @@ L(return_vec_1): > # endif > ret > > - .p2align 4,, 10 > + .p2align 4,, 7 > L(return_vec_2): > /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one > fetch block. */ > - bsfl %eax, %eax > + bsf %VRAX, %VRAX > # ifdef USE_AS_WMEMCMP > movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx > xorl %edx, %edx > @@ -302,7 +321,7 @@ L(return_vec_2): > # endif > ret > > - .p2align 4 > + .p2align 4,, 8 > L(more_8x_vec): > /* Set end of s1 in rdx. 
*/ > leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx > @@ -316,62 +335,82 @@ L(more_8x_vec): > > .p2align 4 > L(loop_4x_vec): > - VMOVU (%rsi, %rdi), %YMM1 > - vpxorq (%rdi), %YMM1, %YMM1 > - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 > - vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 > - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 > - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 > - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 > - vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 > - VPTEST %YMM4, %YMM4, %k1 > - kmovd %k1, %ecx > - testl %ecx, %ecx > + VMOVU (%rsi, %rdi), %VMM(1) > + vpxorq (%rdi), %VMM(1), %VMM(1) > + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2) > + vpxorq VEC_SIZE(%rdi), %VMM(2), %VMM(2) > + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4) > + vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(4) > + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) > + VPTEST %VMM(4), %VMM(4), %k1 > + /* If VEC_SIZE == 64 just branch with KTEST. We have free port0 > + space and it allows the loop to fit in 2x cache lines > + instead of 3. */ > +# if VEC_SIZE == 64 > + KTEST %k1, %k1 > +# else > + KMOV %k1, %VRCX > + test %VRCX, %VRCX > +# endif > jnz L(8x_return_vec_0_1_2_3) > subq $-(VEC_SIZE * 4), %rdi > cmpq %rdx, %rdi > jb L(loop_4x_vec) > - > subq %rdx, %rdi > /* rdi has 4 * VEC_SIZE - remaining length. */ > cmpl $(VEC_SIZE * 3), %edi > - jae L(8x_last_1x_vec) > + jge L(8x_last_1x_vec) > /* Load regardless of branch. */ > - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 > - cmpl $(VEC_SIZE * 2), %edi > - jae L(8x_last_2x_vec) > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3) > > - vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 > - > - VMOVU (%rsi, %rdx), %YMM1 > - vpxorq (%rdx), %YMM1, %YMM1 > + /* Seperate logic as we can only use testb for VEC_SIZE == 64. > + */ > +# if VEC_SIZE == 64 > + testb %dil, %dil > + js L(8x_last_2x_vec) > +# else > + cmpl $(VEC_SIZE * 2), %edi > + jge L(8x_last_2x_vec) > +# endif > > - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 > - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 > - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 > - vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 > - VPTEST %YMM4, %YMM4, %k1 > - kmovd %k1, %ecx > - testl %ecx, %ecx > + vpxorq (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(3) > + > + VMOVU (%rsi, %rdx), %VMM(1) > + vpxorq (%rdx), %VMM(1), %VMM(1) > + > + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2) > + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4) > + vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %VMM(1), %VMM(4) > + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) > + VPTEST %VMM(4), %VMM(4), %k1 > + /* L(8x_end_return_vec_0_1_2_3) expects bitmask to still be in > + `k1` if VEC_SIZE == 64. */ > +# if VEC_SIZE == 64 > + KTEST %k1, %k1 > +# else > + KMOV %k1, %VRCX > + test %VRCX, %VRCX > +# endif > jnz L(8x_end_return_vec_0_1_2_3) > /* NB: eax must be zero to reach here. */ > ret > > /* Only entry is from L(more_8x_vec). */ > - .p2align 4,, 10 > + .p2align 4,, 6 > L(8x_last_2x_vec): > - VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VPCMP $4, (VEC_SIZE * 2)(%rdx), %VMM(3), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(8x_return_vec_2) > - /* Naturally aligned to 16 bytes. 
*/ > + .p2align 4,, 5 > L(8x_last_1x_vec): > - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 > - VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(1) > + VPCMP $4, (VEC_SIZE * 3)(%rdx), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(8x_return_vec_3) > ret > > @@ -383,7 +422,7 @@ L(8x_last_1x_vec): > L(8x_return_vec_2): > subq $VEC_SIZE, %rdx > L(8x_return_vec_3): > - bsfl %eax, %eax > + bsf %VRAX, %VRAX > # ifdef USE_AS_WMEMCMP > leaq (%rdx, %rax, CHAR_SIZE), %rax > movl (VEC_SIZE * 3)(%rax), %ecx > @@ -399,32 +438,34 @@ L(8x_return_vec_3): > # endif > ret > > - .p2align 4,, 10 > + .p2align 4,, 8 > L(last_2x_vec): > /* Check second to last VEC. */ > - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 > - VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_1_end) > > /* Check last VEC. */ > - .p2align 4 > + .p2align 4,, 8 > L(last_1x_vec): > - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 > - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > jnz L(return_vec_0_end) > ret > > > - /* Don't align. Takes 2-fetch blocks either way and aligning > - will cause code to spill into another cacheline. */ > + /* Don't fully align. Takes 2-fetch blocks either way and > + aligning will cause code to spill into another cacheline. > + */ > + .p2align 4,, 3 > L(return_vec_1_end): > /* Use bsf to save code size. This is necessary to have > L(one_or_less) fit in aligning bytes between. */ > - bsfl %eax, %eax > + bsf %VRAX, %VRAX > addl %edx, %eax > # ifdef USE_AS_WMEMCMP > movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx > @@ -439,10 +480,11 @@ L(return_vec_1_end): > # endif > ret > > + .p2align 4,, 2 > /* Don't align. Takes 2-fetch blocks either way and aligning > will cause code to spill into another cacheline. */ > L(return_vec_0_end): > - tzcntl %eax, %eax > + bsf %VRAX, %VRAX > addl %edx, %eax > # ifdef USE_AS_WMEMCMP > movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx > @@ -456,7 +498,7 @@ L(return_vec_0_end): > subl %ecx, %eax > # endif > ret > - /* 1-byte until next cache line. */ > - > + /* evex256: 2-byte until next cache line. evex512: 46-bytes > + until next cache line. */ > END (MEMCMP) > #endif > -- > 2.34.1 > LGTM. Thanks. -- H.J. ^ permalink raw reply [flat|nested] 7+ messages in thread
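A note on the vpternlogd immediates that appear throughout the quoted code: bit ((A << 2) | (B << 1) | C) of the immediate holds the desired result for that combination of input bits, where A is the destination operand (the last operand in AT&T syntax). The $0xde constant (fold an xor into an or) and $0xfe (three-way or) can be derived mechanically; a throwaway generator, for illustration only:

    #include <stdint.h>
    #include <stdio.h>

    /* Build a vpternlog immediate from a boolean function of (A, B, C). */
    static uint8_t
    ternlog_imm (int (*f) (int, int, int))
    {
      uint8_t imm = 0;
      for (int a = 0; a < 2; a++)
        for (int b = 0; b < 2; b++)
          for (int c = 0; c < 2; c++)
            imm |= (uint8_t) (f (a, b, c) << ((a << 2) | (b << 1) | c));
      return imm;
    }

    /* (A ^ C) | B: xor the memory operand into the destination while
       oring in another vector, as the patch's comments describe.  */
    static int xor_then_or (int a, int b, int c) { return (a ^ c) | b; }
    static int or3 (int a, int b, int c) { return a | b | c; }

    int
    main (void)
    {
      /* Prints "0xde 0xfe", the immediates used in the patch.  */
      printf ("0x%02x 0x%02x\n", ternlog_imm (xor_then_or),
              ternlog_imm (or3));
      return 0;
    }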
* [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes 2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein 2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein @ 2022-10-29 20:19 ` Noah Goldstein 2022-10-31 15:48 ` H.J. Lu 2022-10-31 13:19 ` [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Siddhesh Poyarekar 2 siblings, 1 reply; 7+ messages in thread From: Noah Goldstein @ 2022-10-29 20:19 UTC (permalink / raw) To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos Changes to generated code are: 1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a byte of code size. 2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a single basic-block (the space to add the extra branch without changing code size is bought with the above change). Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 + 1, VEC_SIZE * 6] and negligible to no-cost for [VEC_SIZE * 6 + 1, VEC_SIZE * 8] From N=10 runs on Tigerlake: align1,align2 ,length ,result ,New TIme ,Cur Time,New Time / Old Time 0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785 0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778 0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785 0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774 0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788 0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798 0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000 0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029 0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011 0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989 0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003 0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992 0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989 0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978 0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964 Rewriting with VMM API allows for memcmpeq-evex to be used with evex512 by including "x86-evex512-vecs.h" at the top. Complete check passes on x86-64. --- sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++--------- 1 file changed, 155 insertions(+), 100 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S index 41124ef1d3..671d19393e 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S @@ -41,24 +41,53 @@ # define MEMCMPEQ __memcmpeq_evex # endif +# ifndef VEC_SIZE +# include "x86-evex512-vecs.h" +# endif +# include "reg-macros.h" + + +# if VEC_SIZE == 32 + +# define TEST_ZERO_VCMP(reg) inc %VGPR(reg) +# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg) + +# define TO_32BIT_P1(reg) /* Do nothing. */ +# define TO_32BIT_P2(reg) /* Do nothing. */ +# define TO_32BIT(reg) /* Do nothing. */ + +# define VEC_CMP VPCMPEQ + +# elif VEC_SIZE == 64 + +# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg) +# define TEST_ZERO(reg) neg %VGPR(reg) + + + /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit + int. We have two methods for this. If the mask with branched + on, we use `neg` for the branch then `sbb` to get the 32-bit + return. If the mask was no branched on, we just use + `popcntq`. 
*/ +# define TO_32BIT_P1(reg) TEST_ZERO(reg) +# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32) +# define TO_32BIT(reg) popcntq %reg, %reg + +# define VEC_CMP VPCMPNEQ + +# else +# error "Unsupported VEC_SIZE" +# endif + + # define VMOVU_MASK vmovdqu8 -# define VMOVU vmovdqu64 -# define VPCMP vpcmpub +# define VPCMPNEQ vpcmpneqb +# define VPCMPEQ vpcmpeqb # define VPTEST vptestmb -# define VEC_SIZE 32 # define PAGE_SIZE 4096 -# define YMM0 ymm16 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 - - - .section .text.evex, "ax", @progbits + .section SECTION(.text), "ax", @progbits ENTRY_P2ALIGN (MEMCMPEQ, 6) # ifdef __ILP32__ /* Clear the upper 32 bits. */ @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6) ja L(more_1x_vec) /* Create mask of bytes that are guranteed to be valid because - of length (edx). Using masked movs allows us to skip checks for - page crosses/zero size. */ - movl $-1, %ecx - bzhil %edx, %ecx, %ecx - kmovd %ecx, %k2 + of length (edx). Using masked movs allows us to skip checks + for page crosses/zero size. */ + mov $-1, %VRAX + bzhi %VRDX, %VRAX, %VRAX + /* NB: A `jz` might be useful here. Page-faults that are + invalidated by predicate execution (the evex mask) can be + very slow. The expectation is this is not the norm so and + "most" code will not regularly call 'memcmp' with length = 0 + and memory that is not wired up. */ + KMOV %VRAX, %k2 /* Use masked loads as VEC_SIZE could page cross where length (edx) would not. */ - VMOVU_MASK (%rsi), %YMM2{%k2} - VPCMP $4,(%rdi), %YMM2, %k1{%k2} - kmovd %k1, %eax + VMOVU_MASK (%rsi), %VMM(2){%k2}{z} + VPCMPNEQ (%rdi), %VMM(2), %k1{%k2} + KMOV %k1, %VRAX + TO_32BIT (VRAX) ret - + .p2align 4,, 3 L(last_1x_vec): - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1 - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1 - kmovd %k1, %eax + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1) + VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1 + KMOV %k1, %VRAX + TO_32BIT_P1 (rax) L(return_neq0): + TO_32BIT_P2 (rax) ret - - .p2align 4 + .p2align 4,, 12 L(more_1x_vec): /* From VEC + 1 to 2 * VEC. */ - VMOVU (%rsi), %YMM1 + VMOVU (%rsi), %VMM(1) /* Use compare not equals to directly check for mismatch. */ - VPCMP $4,(%rdi), %YMM1, %k1 - kmovd %k1, %eax - testl %eax, %eax + VPCMPNEQ (%rdi), %VMM(1), %k1 + KMOV %k1, %VRAX + TEST_ZERO (rax) jnz L(return_neq0) cmpq $(VEC_SIZE * 2), %rdx jbe L(last_1x_vec) /* Check second VEC no matter what. */ - VMOVU VEC_SIZE(%rsi), %YMM2 - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU VEC_SIZE(%rsi), %VMM(2) + VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1 + KMOV %k1, %VRAX + TEST_ZERO (rax) jnz L(return_neq0) /* Less than 4 * VEC. */ @@ -117,16 +153,16 @@ L(more_1x_vec): jbe L(last_2x_vec) /* Check third and fourth VEC no matter what. */ - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) + VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1 + KMOV %k1, %VRAX + TEST_ZERO_VCMP (rax) jnz L(return_neq0) - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 - kmovd %k1, %eax - testl %eax, %eax + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) + VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1 + KMOV %k1, %VRAX + TEST_ZERO_VCMP (rax) jnz L(return_neq0) /* Go to 4x VEC loop. */ @@ -136,8 +172,8 @@ L(more_1x_vec): /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any branches. 
*/ - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1 - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2 + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2) addq %rdx, %rdi /* Wait to load from s1 until addressed adjust due to @@ -145,26 +181,32 @@ L(more_1x_vec): /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it will have some 1s. */ - vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1 - /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while - oring with YMM1. Result is stored in YMM1. */ - vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2 - - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3 - vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 - /* Or together YMM1, YMM2, and YMM3 into YMM3. */ - VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4 - vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4 - - /* Or together YMM2, YMM3, and YMM4 into YMM4. */ - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 - - /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ - VPTEST %YMM4, %YMM4, %k1 - kmovd %k1, %eax + vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1) + /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while + oring with VEC(1). Result is stored in VEC(1). */ + vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2) + + cmpl $(VEC_SIZE * 6), %edx + jbe L(4x_last_2x_vec) + + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3) + vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3) + /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */ + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4) + vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4) + + /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2). */ + vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2) + + /* Compare VEC(4) with 0. If any 1s s1 and s2 don't match. */ +L(4x_last_2x_vec): + VPTEST %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRAX + TO_32BIT (VRAX) ret - .p2align 4 + + .p2align 4,, 10 L(more_8x_vec): /* Set end of s1 in rdx. */ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx @@ -175,67 +217,80 @@ L(more_8x_vec): andq $-VEC_SIZE, %rdi /* Adjust because first 4x vec where check already. */ subq $-(VEC_SIZE * 4), %rdi - .p2align 4 + .p2align 5,, 12 + .p2align 4,, 8 L(loop_4x_vec): - VMOVU (%rsi, %rdi), %YMM1 - vpxorq (%rdi), %YMM1, %YMM1 + VMOVU (%rsi, %rdi), %VMM(1) + vpxorq (%rdi), %VMM(1), %VMM(1) - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 - vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2 + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2) + vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2) - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3) + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 - vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4) + vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4) - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 - VPTEST %YMM4, %YMM4, %k1 - kmovd %k1, %eax - testl %eax, %eax + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) + VPTEST %VMM(4), %VMM(4), %k1 + KMOV %k1, %VRAX + TEST_ZERO (rax) jnz L(return_neq2) subq $-(VEC_SIZE * 4), %rdi cmpq %rdx, %rdi jb L(loop_4x_vec) subq %rdx, %rdi - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 - vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4 + + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4) + vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4) /* rdi has 4 * VEC_SIZE - remaining length. */ - cmpl $(VEC_SIZE * 3), %edi - jae L(8x_last_1x_vec) + /* Load regardless of branch. */ - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 - /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while - oring with YMM4. 
Result is stored in YMM4. */ - vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4 + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3) + /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while + oring with VEC(4). Result is stored in VEC(4). */ + vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4) + + /* Seperate logic as we can only use testb for VEC_SIZE == 64. + */ +# if VEC_SIZE == 64 + testb %dil, %dil + js L(8x_last_2x_vec) +# else cmpl $(VEC_SIZE * 2), %edi - jae L(8x_last_2x_vec) + jge L(8x_last_2x_vec) +# endif - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2) + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2) - VMOVU (%rsi, %rdx), %YMM1 - vpxorq (%rdx), %YMM1, %YMM1 + VMOVU (%rsi, %rdx), %VMM(1) + vpxorq (%rdx), %VMM(1), %VMM(1) - vpternlogd $0xfe, %YMM1, %YMM2, %YMM4 + vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4) L(8x_last_1x_vec): L(8x_last_2x_vec): - VPTEST %YMM4, %YMM4, %k1 - kmovd %k1, %eax + VPTEST %VMM(4), %VMM(4), %k1 + KMOV %k1, %VRAX + TO_32BIT_P1 (rax) L(return_neq2): + TO_32BIT_P2 (rax) ret - .p2align 4,, 8 + .p2align 4,, 4 L(last_2x_vec): - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1 - vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1 - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2 - vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2 - VPTEST %YMM2, %YMM2, %k1 - kmovd %k1, %eax + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1) + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1) + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2) + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2) + VPTEST %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRAX + TO_32BIT (VRAX) ret - /* 1 Bytes from next cache line. */ + /* evex256: 1 Bytes from next cache line. evex512: 15 Bytes from + next cache line. */ END (MEMCMPEQ) #endif -- 2.34.1 ^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes 2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein @ 2022-10-31 15:48 ` H.J. Lu 2022-10-31 16:42 ` Noah Goldstein 0 siblings, 1 reply; 7+ messages in thread From: H.J. Lu @ 2022-10-31 15:48 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > Changes to generated code are: > 1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a > byte of code size. > 2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing > the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a > single basic-block (the space to add the extra branch without > changing code size is bought with the above change). > > Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 + > 1, VEC_SIZE * 6] and negligible to no-cost for [VEC_SIZE * 6 + 1, > VEC_SIZE * 8] > > From N=10 runs on Tigerlake: > > align1,align2 ,length ,result ,New TIme ,Cur Time,New Time / Old Time > 0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785 > 0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778 > 0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785 > 0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774 > 0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788 > 0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798 > > 0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000 > 0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029 > 0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011 > 0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989 > 0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003 > 0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992 > 0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989 > 0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978 > 0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964 > > Rewriting with VMM API allows for memcmpeq-evex to be used with > evex512 by including "x86-evex512-vecs.h" at the top. > > Complete check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++--------- > 1 file changed, 155 insertions(+), 100 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S > index 41124ef1d3..671d19393e 100644 > --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S > +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S > @@ -41,24 +41,53 @@ > # define MEMCMPEQ __memcmpeq_evex > # endif > > +# ifndef VEC_SIZE > +# include "x86-evex512-vecs.h" > +# endif > +# include "reg-macros.h" > + > + > +# if VEC_SIZE == 32 > + > +# define TEST_ZERO_VCMP(reg) inc %VGPR(reg) > +# define TEST_ZERO(reg) test %VGPR(reg), %VGPR(reg) > + > +# define TO_32BIT_P1(reg) /* Do nothing. */ > +# define TO_32BIT_P2(reg) /* Do nothing. */ > +# define TO_32BIT(reg) /* Do nothing. */ > + > +# define VEC_CMP VPCMPEQ > + > +# elif VEC_SIZE == 64 > + > +# define TEST_ZERO_VCMP(reg) TEST_ZERO(reg) > +# define TEST_ZERO(reg) neg %VGPR(reg) > + > + > + /* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit > + int. We have two methods for this. If the mask with branched > + on, we use `neg` for the branch then `sbb` to get the 32-bit > + return. If the mask was no branched on, we just use > + `popcntq`. 
*/ > +# define TO_32BIT_P1(reg) TEST_ZERO(reg) > +# define TO_32BIT_P2(reg) sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32) > +# define TO_32BIT(reg) popcntq %reg, %reg > + > +# define VEC_CMP VPCMPNEQ > + > +# else > +# error "Unsupported VEC_SIZE" > +# endif > + > + > # define VMOVU_MASK vmovdqu8 > -# define VMOVU vmovdqu64 > -# define VPCMP vpcmpub > +# define VPCMPNEQ vpcmpneqb > +# define VPCMPEQ vpcmpeqb > # define VPTEST vptestmb > > -# define VEC_SIZE 32 > # define PAGE_SIZE 4096 > > -# define YMM0 ymm16 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > - > - > - .section .text.evex, "ax", @progbits > + .section SECTION(.text), "ax", @progbits > ENTRY_P2ALIGN (MEMCMPEQ, 6) > # ifdef __ILP32__ > /* Clear the upper 32 bits. */ > @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6) > ja L(more_1x_vec) > > /* Create mask of bytes that are guranteed to be valid because > - of length (edx). Using masked movs allows us to skip checks for > - page crosses/zero size. */ > - movl $-1, %ecx > - bzhil %edx, %ecx, %ecx > - kmovd %ecx, %k2 > + of length (edx). Using masked movs allows us to skip checks > + for page crosses/zero size. */ > + mov $-1, %VRAX > + bzhi %VRDX, %VRAX, %VRAX > + /* NB: A `jz` might be useful here. Page-faults that are > + invalidated by predicate execution (the evex mask) can be > + very slow. The expectation is this is not the norm so and > + "most" code will not regularly call 'memcmp' with length = 0 > + and memory that is not wired up. */ > + KMOV %VRAX, %k2 > > /* Use masked loads as VEC_SIZE could page cross where length > (edx) would not. */ > - VMOVU_MASK (%rsi), %YMM2{%k2} > - VPCMP $4,(%rdi), %YMM2, %k1{%k2} > - kmovd %k1, %eax > + VMOVU_MASK (%rsi), %VMM(2){%k2}{z} > + VPCMPNEQ (%rdi), %VMM(2), %k1{%k2} > + KMOV %k1, %VRAX > + TO_32BIT (VRAX) > ret > > - > + .p2align 4,, 3 > L(last_1x_vec): > - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1 > - VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1 > - kmovd %k1, %eax > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1) > + VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1 > + KMOV %k1, %VRAX > + TO_32BIT_P1 (rax) > L(return_neq0): > + TO_32BIT_P2 (rax) > ret > > > - > - .p2align 4 > + .p2align 4,, 12 > L(more_1x_vec): > /* From VEC + 1 to 2 * VEC. */ > - VMOVU (%rsi), %YMM1 > + VMOVU (%rsi), %VMM(1) > /* Use compare not equals to directly check for mismatch. */ > - VPCMP $4,(%rdi), %YMM1, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VPCMPNEQ (%rdi), %VMM(1), %k1 > + KMOV %k1, %VRAX > + TEST_ZERO (rax) > jnz L(return_neq0) > > cmpq $(VEC_SIZE * 2), %rdx > jbe L(last_1x_vec) > > /* Check second VEC no matter what. */ > - VMOVU VEC_SIZE(%rsi), %YMM2 > - VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU VEC_SIZE(%rsi), %VMM(2) > + VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1 > + KMOV %k1, %VRAX > + TEST_ZERO (rax) > jnz L(return_neq0) > > /* Less than 4 * VEC. */ > @@ -117,16 +153,16 @@ L(more_1x_vec): > jbe L(last_2x_vec) > > /* Check third and fourth VEC no matter what. 
*/ > - VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 > - VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU (VEC_SIZE * 2)(%rsi), %VMM(3) > + VEC_CMP (VEC_SIZE * 2)(%rdi), %VMM(3), %k1 > + KMOV %k1, %VRAX > + TEST_ZERO_VCMP (rax) > jnz L(return_neq0) > > - VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 > - VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + VMOVU (VEC_SIZE * 3)(%rsi), %VMM(4) > + VEC_CMP (VEC_SIZE * 3)(%rdi), %VMM(4), %k1 > + KMOV %k1, %VRAX > + TEST_ZERO_VCMP (rax) > jnz L(return_neq0) > > /* Go to 4x VEC loop. */ > @@ -136,8 +172,8 @@ L(more_1x_vec): > /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any > branches. */ > > - VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1 > - VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2 > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1) > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(2) > addq %rdx, %rdi > > /* Wait to load from s1 until addressed adjust due to > @@ -145,26 +181,32 @@ L(more_1x_vec): > > /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it > will have some 1s. */ > - vpxorq -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1 > - /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while > - oring with YMM1. Result is stored in YMM1. */ > - vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2 > - > - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3 > - vpxorq -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 > - /* Or together YMM1, YMM2, and YMM3 into YMM3. */ > - VMOVU -(VEC_SIZE)(%rsi, %rdx), %YMM4 > - vpxorq -(VEC_SIZE)(%rdi), %YMM4, %YMM4 > - > - /* Or together YMM2, YMM3, and YMM4 into YMM4. */ > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 > - > - /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ > - VPTEST %YMM4, %YMM4, %k1 > - kmovd %k1, %eax > + vpxorq -(VEC_SIZE * 1)(%rdi), %VMM(1), %VMM(1) > + /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with VEC(2) while > + oring with VEC(1). Result is stored in VEC(1). */ > + vpternlogd $0xde, -(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(2) > + > + cmpl $(VEC_SIZE * 6), %edx > + jbe L(4x_last_2x_vec) > + > + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VMM(3) > + vpxorq -(VEC_SIZE * 3)(%rdi), %VMM(3), %VMM(3) > + /* Or together VEC(1), VEC(2), and VEC(3) into VEC(3). */ > + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VMM(4) > + vpxorq -(VEC_SIZE * 4)(%rdi), %VMM(4), %VMM(4) > + > + /* Or together VEC(4), VEC(3), and VEC(2) into VEC(2). */ > + vpternlogd $0xfe, %VMM(4), %VMM(3), %VMM(2) > + > + /* Compare VEC(4) with 0. If any 1s s1 and s2 don't match. */ > +L(4x_last_2x_vec): > + VPTEST %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRAX > + TO_32BIT (VRAX) > ret > > - .p2align 4 > + > + .p2align 4,, 10 > L(more_8x_vec): > /* Set end of s1 in rdx. */ > leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx > @@ -175,67 +217,80 @@ L(more_8x_vec): > andq $-VEC_SIZE, %rdi > /* Adjust because first 4x vec where check already. 
*/ > subq $-(VEC_SIZE * 4), %rdi > - .p2align 4 > + .p2align 5,, 12 > + .p2align 4,, 8 > L(loop_4x_vec): > - VMOVU (%rsi, %rdi), %YMM1 > - vpxorq (%rdi), %YMM1, %YMM1 > + VMOVU (%rsi, %rdi), %VMM(1) > + vpxorq (%rdi), %VMM(1), %VMM(1) > > - VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 > - vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2 > + VMOVU VEC_SIZE(%rsi, %rdi), %VMM(2) > + vpternlogd $0xde, (VEC_SIZE)(%rdi), %VMM(1), %VMM(2) > > - VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 > - vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 > + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %VMM(3) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMM(3), %VMM(3) > > - VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 > - vpxorq (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4 > + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %VMM(4) > + vpxorq (VEC_SIZE * 3)(%rdi), %VMM(4), %VMM(4) > > - vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 > - VPTEST %YMM4, %YMM4, %k1 > - kmovd %k1, %eax > - testl %eax, %eax > + vpternlogd $0xfe, %VMM(2), %VMM(3), %VMM(4) > + VPTEST %VMM(4), %VMM(4), %k1 > + KMOV %k1, %VRAX > + TEST_ZERO (rax) > jnz L(return_neq2) > subq $-(VEC_SIZE * 4), %rdi > cmpq %rdx, %rdi > jb L(loop_4x_vec) > > subq %rdx, %rdi > - VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 > - vpxorq (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4 > + > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %VMM(4) > + vpxorq (VEC_SIZE * 3)(%rdx), %VMM(4), %VMM(4) > /* rdi has 4 * VEC_SIZE - remaining length. */ > - cmpl $(VEC_SIZE * 3), %edi > - jae L(8x_last_1x_vec) > + > /* Load regardless of branch. */ > - VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 > - /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while > - oring with YMM4. Result is stored in YMM4. */ > - vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4 > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %VMM(3) > + /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with VEC(3) while > + oring with VEC(4). Result is stored in VEC(4). */ > + vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %VMM(3), %VMM(4) > + > + /* Seperate logic as we can only use testb for VEC_SIZE == 64. > + */ > +# if VEC_SIZE == 64 > + testb %dil, %dil > + js L(8x_last_2x_vec) > +# else > cmpl $(VEC_SIZE * 2), %edi > - jae L(8x_last_2x_vec) > + jge L(8x_last_2x_vec) > +# endif > > - VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 > - vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 > + VMOVU VEC_SIZE(%rsi, %rdx), %VMM(2) > + vpxorq VEC_SIZE(%rdx), %VMM(2), %VMM(2) > > - VMOVU (%rsi, %rdx), %YMM1 > - vpxorq (%rdx), %YMM1, %YMM1 > + VMOVU (%rsi, %rdx), %VMM(1) > + vpxorq (%rdx), %VMM(1), %VMM(1) > > - vpternlogd $0xfe, %YMM1, %YMM2, %YMM4 > + vpternlogd $0xfe, %VMM(1), %VMM(2), %VMM(4) > L(8x_last_1x_vec): > L(8x_last_2x_vec): > - VPTEST %YMM4, %YMM4, %k1 > - kmovd %k1, %eax > + VPTEST %VMM(4), %VMM(4), %k1 > + KMOV %k1, %VRAX > + TO_32BIT_P1 (rax) > L(return_neq2): > + TO_32BIT_P2 (rax) > ret > > - .p2align 4,, 8 > + .p2align 4,, 4 > L(last_2x_vec): > - VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1 > - vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1 > - VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2 > - vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2 > - VPTEST %YMM2, %YMM2, %k1 > - kmovd %k1, %eax > + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VMM(1) > + vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %VMM(1), %VMM(1) > + VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %VMM(2) > + vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %VMM(2) > + VPTEST %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRAX > + TO_32BIT (VRAX) > ret > > - /* 1 Bytes from next cache line. */ > + /* evex256: 1 Bytes from next cache line. evex512: 15 Bytes from > + next cache line. 
> END (MEMCMPEQ)
> #endif
> --
> 2.34.1
>

LGTM.

Thanks.

--
H.J.
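The `vpternlogd` immediates that do the heavy lifting in the hunks above
($0xde, $0xf6, $0xfe) each fold a vpxorq/vpor pair into one instruction:
the imm8 is a 3-input truth table indexed by
((dest_bit << 2) | (src1_bit << 1) | src2_bit). The following minimal C
sketch is illustrative only (it is not part of the patch); it evaluates
the truth table bit-by-bit and checks the identities the diff comments
rely on:

#include <assert.h>
#include <stdint.h>

/* Evaluate a vpternlog-style imm8 truth table bit-by-bit: result bit i
   is imm8[(a_i << 2) | (b_i << 1) | c_i], where a is the destination
   operand and b/c are the two sources.  */
static uint64_t
ternlog (uint8_t imm8, uint64_t a, uint64_t b, uint64_t c)
{
  uint64_t r = 0;
  for (int i = 0; i < 64; i++)
    {
      unsigned idx = (((a >> i) & 1) << 2)
		     | (((b >> i) & 1) << 1)
		     | ((c >> i) & 1);
      r |= (uint64_t) ((imm8 >> idx) & 1) << i;
    }
  return r;
}

int
main (void)
{
  uint64_t a = 0x00ff00ff00ff00ffULL;
  uint64_t b = 0x0f0f0f0f0f0f0f0fULL;
  uint64_t c = 0x3333333333333333ULL;

  /* $0xde: dest = src1 | (dest ^ src2), i.e. xor one operand into the
     dest while oring in the running accumulator.  */
  assert (ternlog (0xde, a, b, c) == (b | (a ^ c)));
  /* $0xf6: dest = dest | (src1 ^ src2).  */
  assert (ternlog (0xf6, a, b, c) == (a | (b ^ c)));
  /* $0xfe: dest = dest | src1 | src2, a three-way or.  */
  assert (ternlog (0xfe, a, b, c) == (a | b | c));
  return 0;
}

This is why a single `vpternlogd $0xfe` can fold the per-vector xor
results down to one vector before the `VPTEST`.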
* Re: [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S and minor changes
  2022-10-31 15:48   ` H.J. Lu
@ 2022-10-31 16:42     ` Noah Goldstein
  0 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-10-31 16:42 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Mon, Oct 31, 2022 at 10:48 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sat, Oct 29, 2022 at 1:20 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Changes to generated code are:
> > 1. In a few places use `vpcmpeqb` instead of `vpcmpneq` to save a
> >    byte of code size.
> > 2. Add a branch for length <= (VEC_SIZE * 6) as opposed to doing
> >    the entire block of [VEC_SIZE * 4 + 1, VEC_SIZE * 8] in a
> >    single basic-block (the space to add the extra branch without
> >    changing code size is bought with the above change).
> >
> > Change (2) has roughly a 20-25% speedup for sizes in [VEC_SIZE * 4 +
> > 1, VEC_SIZE * 6] and negligible to no cost for [VEC_SIZE * 6 + 1,
> > VEC_SIZE * 8].
> >
> > From N=10 runs on Tigerlake:
> >
> > align1,align2 ,length ,result ,New Time ,Cur Time,New Time / Old Time
> > 0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
> > 0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
> > 0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
> > 0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
> > 0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
> > 0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
> >
> > 0 ,0 ,193 ,0 ,6.969 ,6.832 ,1.000
> > 0 ,0 ,193 ,1 ,6.943 ,6.748 ,1.029
> > 0 ,0 ,193 ,18446744073709551615 ,6.997 ,6.728 ,1.011
> > 0 ,0 ,225 ,0 ,7.144 ,6.746 ,0.989
> > 0 ,0 ,225 ,1 ,7.218 ,6.683 ,1.003
> > 0 ,0 ,225 ,18446744073709551615 ,6.864 ,6.767 ,0.992
> > 0 ,0 ,256 ,0 ,5.423 ,5.482 ,0.989
> > 0 ,0 ,256 ,1 ,5.348 ,5.465 ,0.978
> > 0 ,0 ,256 ,18446744073709551615 ,5.321 ,5.518 ,0.964
> >
> > Rewriting with VMM API allows for memcmpeq-evex to be used with
> > evex512 by including "x86-evex512-vecs.h" at the top.
> >
> > Complete check passes on x86-64.
> > ---
> >  sysdeps/x86_64/multiarch/memcmpeq-evex.S | 255 ++++++++++++++---------
> >  1 file changed, 155 insertions(+), 100 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > index 41124ef1d3..671d19393e 100644
> > --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> > @@ -41,24 +41,53 @@
> >  # define MEMCMPEQ	__memcmpeq_evex
> >  # endif
> >
> > +# ifndef VEC_SIZE
> > +# include "x86-evex512-vecs.h"
> > +# endif
> > +# include "reg-macros.h"
> > +
> > +
> > +# if VEC_SIZE == 32
> > +
> > +# define TEST_ZERO_VCMP(reg)	inc %VGPR(reg)
> > +# define TEST_ZERO(reg)	test %VGPR(reg), %VGPR(reg)
> > +
> > +# define TO_32BIT_P1(reg)	/* Do nothing.  */
> > +# define TO_32BIT_P2(reg)	/* Do nothing.  */
> > +# define TO_32BIT(reg)	/* Do nothing.  */
> > +
> > +# define VEC_CMP	VPCMPEQ
> > +
> > +# elif VEC_SIZE == 64
> > +
> > +# define TEST_ZERO_VCMP(reg)	TEST_ZERO(reg)
> > +# define TEST_ZERO(reg)	neg %VGPR(reg)
> > +
> > +
> > +	/* VEC_SIZE == 64 needs to reduce the 64-bit mask to a 32-bit
> > +	   int. We have two methods for this. If the mask was branched
> > +	   on, we use `neg` for the branch then `sbb` to get the 32-bit
> > +	   return. If the mask was not branched on, we just use
> > +	   `popcntq`.  */
> > +# define TO_32BIT_P1(reg)	TEST_ZERO(reg)
> > +# define TO_32BIT_P2(reg)	sbb %VGPR_SZ(reg, 32), %VGPR_SZ(reg, 32)
> > +# define TO_32BIT(reg)	popcntq %reg, %reg
> > +
> > +# define VEC_CMP	VPCMPNEQ
> > +
> > +# else
> > +# error "Unsupported VEC_SIZE"
> > +# endif
> > +
> > +
> >  # define VMOVU_MASK	vmovdqu8
> > -# define VMOVU	vmovdqu64
> > -# define VPCMP	vpcmpub
> > +# define VPCMPNEQ	vpcmpneqb
> > +# define VPCMPEQ	vpcmpeqb
> >  # define VPTEST	vptestmb
> >
> > -# define VEC_SIZE	32
> >  # define PAGE_SIZE	4096
> >
> > -# define YMM0	ymm16
> > -# define YMM1	ymm17
> > -# define YMM2	ymm18
> > -# define YMM3	ymm19
> > -# define YMM4	ymm20
> > -# define YMM5	ymm21
> > -# define YMM6	ymm22
> > -
> > -
> > -	.section .text.evex, "ax", @progbits
> > +	.section SECTION(.text), "ax", @progbits
> >  ENTRY_P2ALIGN (MEMCMPEQ, 6)
> >  # ifdef __ILP32__
> >  	/* Clear the upper 32 bits.  */
> > @@ -69,47 +98,54 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
> >  	ja	L(more_1x_vec)
> >
> >  	/* Create mask of bytes that are guaranteed to be valid because
> > -	   of length (edx). Using masked movs allows us to skip checks for
> > -	   page crosses/zero size.  */
> > -	movl	$-1, %ecx
> > -	bzhil	%edx, %ecx, %ecx
> > -	kmovd	%ecx, %k2
> > +	   of length (edx). Using masked movs allows us to skip checks
> > +	   for page crosses/zero size.  */
> > +	mov	$-1, %VRAX
> > +	bzhi	%VRDX, %VRAX, %VRAX
> > +	/* NB: A `jz` might be useful here. Page-faults that are
> > +	   invalidated by predicate execution (the evex mask) can be
> > +	   very slow. The expectation is this is not the norm, and
> > +	   "most" code will not regularly call 'memcmp' with length = 0
> > +	   and memory that is not wired up.  */
> > +	KMOV	%VRAX, %k2
> >
> >  	/* Use masked loads as VEC_SIZE could page cross where length
> >  	   (edx) would not.  */
> > -	VMOVU_MASK (%rsi), %YMM2{%k2}
> > -	VPCMP	$4,(%rdi), %YMM2, %k1{%k2}
> > -	kmovd	%k1, %eax
> > +	VMOVU_MASK (%rsi), %VMM(2){%k2}{z}
> > +	VPCMPNEQ (%rdi), %VMM(2), %k1{%k2}
> > +	KMOV	%k1, %VRAX
> > +	TO_32BIT (VRAX)
> >  	ret
> >
> > -
> > +	.p2align 4,, 3
> >  L(last_1x_vec):
> > -	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> > -	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> > -	kmovd	%k1, %eax
> > +	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx), %VMM(1)
> > +	VPCMPNEQ -(VEC_SIZE * 1)(%rdi, %rdx), %VMM(1), %k1
> > +	KMOV	%k1, %VRAX
> > +	TO_32BIT_P1 (rax)
> >  L(return_neq0):
> > +	TO_32BIT_P2 (rax)
> >  	ret
> >
> >
> > -
> > -	.p2align 4
> > +	.p2align 4,, 12
> >  L(more_1x_vec):
> >  	/* From VEC + 1 to 2 * VEC.  */
> > -	VMOVU	(%rsi), %YMM1
> > +	VMOVU	(%rsi), %VMM(1)
> >  	/* Use compare not equals to directly check for mismatch.  */
> > -	VPCMP	$4,(%rdi), %YMM1, %k1
> > -	kmovd	%k1, %eax
> > -	testl	%eax, %eax
> > +	VPCMPNEQ (%rdi), %VMM(1), %k1
> > +	KMOV	%k1, %VRAX
> > +	TEST_ZERO (rax)
> >  	jnz	L(return_neq0)
> >
> >  	cmpq	$(VEC_SIZE * 2), %rdx
> >  	jbe	L(last_1x_vec)
> >
> >  	/* Check second VEC no matter what.  */
> > -	VMOVU	VEC_SIZE(%rsi), %YMM2
> > -	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
> > -	kmovd	%k1, %eax
> > -	testl	%eax, %eax
> > +	VMOVU	VEC_SIZE(%rsi), %VMM(2)
> > +	VPCMPNEQ VEC_SIZE(%rdi), %VMM(2), %k1
> > +	KMOV	%k1, %VRAX
> > +	TEST_ZERO (rax)
> >  	jnz	L(return_neq0)
> >
> >  	/* Less than 4 * VEC.  */
> > @@ -117,16 +153,16 @@ L(more_1x_vec):
> >  	jbe	L(last_2x_vec)
> > [...]
>
> LGTM.
>
> Thanks.
>
> --
> H.J.

Err, realized I had messed up my extraction script when consolidating
these numbers (ratios are correct, had pulled in the wrong fields when
grabbing the actual times). Correct numbers are:

align1,align2 ,length ,result ,memcmpeq-v2 ,current ,memcmpeq-v2/current
0 ,0 ,129 ,0 ,5.404 ,6.887 ,0.785
0 ,0 ,129 ,1 ,5.308 ,6.826 ,0.778
0 ,0 ,129 ,18446744073709551615 ,5.359 ,6.823 ,0.785
0 ,0 ,161 ,0 ,5.284 ,6.827 ,0.774
0 ,0 ,161 ,1 ,5.317 ,6.745 ,0.788
0 ,0 ,161 ,18446744073709551615 ,5.406 ,6.778 ,0.798
0 ,0 ,193 ,0 ,6.804 ,6.802 ,1.000
0 ,0 ,193 ,1 ,6.950 ,6.754 ,1.029
0 ,0 ,193 ,18446744073709551615 ,6.792 ,6.719 ,1.011
0 ,0 ,225 ,0 ,6.625 ,6.699 ,0.989
0 ,0 ,225 ,1 ,6.776 ,6.735 ,1.003
0 ,0 ,225 ,18446744073709551615 ,6.758 ,6.738 ,0.992
0 ,0 ,256 ,0 ,5.402 ,5.462 ,0.989
0 ,0 ,256 ,1 ,5.364 ,5.483 ,0.978
0 ,0 ,256 ,18446744073709551615 ,5.341 ,5.539 ,0.964

Will update the commit message before pushing.
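To make the TO_32BIT split quoted above concrete: with VEC_SIZE == 64
the mismatch mask is 64 bits wide, but `__memcmpeq` only has to return
zero/nonzero in an int, so the mask must be reduced. A small C model of
the two reduction paths (a sketch for illustration, not the glibc
macros; the helper names are made up):

#include <assert.h>
#include <stdint.h>

/* Branched path: `neg %rax` sets CF iff rax != 0 (and ZF for the
   jnz); at the return label, `sbb %eax, %eax` computes 0 - 0 - CF.  */
static int
to_32bit_branched (uint64_t mask)
{
  unsigned carry = mask != 0;	/* CF after `neg`.  */
  return 0 - 0 - (int) carry;	/* 0 if equal, -1 if mismatch.  */
}

/* Branchless path: a 64-bit popcount is at most 64, so it always fits
   the 32-bit return and is nonzero exactly when the mask is.  */
static int
to_32bit_popcnt (uint64_t mask)
{
  return __builtin_popcountll (mask);	/* `popcntq %rax, %rax`.  */
}

int
main (void)
{
  assert (to_32bit_branched (0) == 0 && to_32bit_popcnt (0) == 0);
  assert (to_32bit_branched (1ULL << 63) != 0);
  assert (to_32bit_popcnt (1ULL << 63) != 0);
  return 0;
}

The branched exits get the reduction almost for free because `neg`
already set the flags for the `jnz`; the straight-line exits pay one
`popcntq` instead.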
* Re: [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests
  2022-10-29 20:19 [PATCH v1 1/3] string: Add len=0 to {w}memcmp{eq} tests and benchtests Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 2/3] x86: Use VMM API in memcmp-evex-movbe.S and minor changes Noah Goldstein
  2022-10-29 20:19 ` [PATCH v1 3/3] x86: Use VMM API in memcmpeq-evex.S " Noah Goldstein
@ 2022-10-31 13:19 ` Siddhesh Poyarekar
  2 siblings, 0 replies; 7+ messages in thread
From: Siddhesh Poyarekar @ 2022-10-31 13:19 UTC (permalink / raw)
  To: Noah Goldstein, libc-alpha

LGTM.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>

On 2022-10-29 16:19, Noah Goldstein via Libc-alpha wrote:
> len=0 is valid and fairly common so should be tested.
> ---
>  benchtests/bench-memcmp.c | 18 +++++++++---------
>  string/test-memcmp.c      | 16 ++++++++++------
>  2 files changed, 19 insertions(+), 15 deletions(-)
> [...]
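Worth noting how this patch ties into 2/3 and 3/3: the len=0 inputs the
tests now cover are exactly the case the new evex entry paths handle
without a branch, via a `bzhi`-built mask (low `len` bits set) and a
masked load, so a zero mask selects no bytes at all. A short C sketch of
the invariants involved (illustrative only, not taken from either test
file):

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Model of `mov $-1, %VRAX; bzhi %VRDX, %VRAX, %VRAX`: a mask with the
   low `len` bits set.  len == 0 gives an empty mask, so the masked
   vector load reads no memory.  Assumes len <= 64.  */
static uint64_t
length_mask (unsigned len)
{
  return len == 0 ? 0 : (~0ULL >> (64 - len));
}

int
main (void)
{
  /* len == 0 must compare equal no matter what the buffers hold.  */
  char a[1] = { 'x' }, b[1] = { 'y' };
  assert (memcmp (a, b, 0) == 0);

  assert (length_mask (0) == 0);	/* No bytes selected.  */
  assert (length_mask (3) == 0x7);	/* Low 3 bytes selected.  */
  return 0;
}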