* [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
2021-12-25 3:22 [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S Noah Goldstein
@ 2021-12-25 3:22 ` Noah Goldstein
2021-12-26 17:05 ` H.J. Lu
2021-12-25 3:24 ` [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S Noah Goldstein
2021-12-26 17:04 ` H.J. Lu
2 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2021-12-25 3:22 UTC
To: libc-alpha
No bug.
Optimizations are twofold.

1) Replace page cross and 0/1 checks with masked load instructions in
L(less_vec). In applications this reduces branch-misses in the
hot [0, 32] case.
2) Change control flow so that the L(less_vec) case gets the fall-through.

Change 2) helps comparisons in the [0, 32] size range but comes at the
cost of comparisons in the [33, 64] size range. From profiles of GCC and
Python3, 94%+ and 99%+ of calls are in the [0, 32] range, so this
appears to be the right tradeoff.
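
For illustration, the masked-load idea in 1) corresponds roughly to the
following C sketch using AVX-512 intrinsics (a minimal sketch of the
technique, not the committed code; the function name and exact layout
are made up for this example):

    /* Requires AVX512BW, AVX512VL and BMI2, e.g.
       gcc -mavx512bw -mavx512vl -mbmi2.  Assumes len <= 32.  */
    #include <immintrin.h>
    #include <stddef.h>

    int
    memcmpeq_small (const void *s1, const void *s2, size_t len)
    {
      /* bzhi sets the low len bits: exactly the bytes in bounds.  */
      __mmask32 k = _bzhi_u32 (0xffffffffu, (unsigned int) len);
      /* Masked loads never fault on masked-off bytes, so no
         page-cross or zero-size checks are needed.  */
      __m256i v1 = _mm256_maskz_loadu_epi8 (k, s1);
      __m256i v2 = _mm256_maskz_loadu_epi8 (k, s2);
      /* Nonzero iff some in-bounds byte differs, matching the
         kmovd %k1, %eax; ret tail in the assembly.  */
      return (int) _mm256_mask_cmpneq_epi8_mask (k, v1, v2);
    }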
---
sysdeps/x86_64/multiarch/memcmpeq-evex.S | 170 ++++++-----------------
1 file changed, 43 insertions(+), 127 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
index f27e732036..b5e1edbdff 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
@@ -39,6 +39,7 @@
# define MEMCMPEQ __memcmpeq_evex
# endif
+# define VMOVU_MASK vmovdqu8
# define VMOVU vmovdqu64
# define VPCMP vpcmpub
# define VPTEST vptestmb
@@ -62,12 +63,39 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
movl %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
- jb L(less_vec)
+ /* Fall through for [0, VEC_SIZE] as it's the hottest. */
+ ja L(more_1x_vec)
+
+ /* Create mask of bytes that are guaranteed to be valid because
+ of length (edx). Using masked movs allows us to skip checks for
+ page crosses/zero size. */
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k2
+
+ /* Use masked loads as VEC_SIZE could page cross where length
+ (edx) would not. */
+ VMOVU_MASK (%rsi), %YMM2{%k2}
+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ kmovd %k1, %eax
+ ret
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+
+L(last_1x_vec):
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
+ kmovd %k1, %eax
+L(return_neq0):
+ ret
+
+
+
+ .p2align 4
+L(more_1x_vec):
+ /* From VEC + 1 to 2 * VEC. */
VMOVU (%rsi), %YMM1
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4, (%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %YMM1, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -88,13 +116,13 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Check third and fourth VEC no matter what. */
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_neq0)
@@ -132,66 +160,6 @@ ENTRY_P2ALIGN (MEMCMPEQ, 6)
/* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %eax
-L(return_neq0):
- ret
-
- /* Fits in padding needed to .p2align 5 L(less_vec). */
-L(last_1x_vec):
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
- VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
- kmovd %k1, %eax
- ret
-
- /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
- byte aligned. */
- .p2align 5
-L(less_vec):
- /* Check if one or less char. This is necessary for size = 0 but
- is also faster for size = 1. */
- cmpl $1, %edx
- jbe L(one_or_less)
-
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
-
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Result will be zero if s1 and s2 match. Otherwise first set
- bit will be first mismatch. */
- bzhil %edx, %eax, %eax
- ret
-
- /* Relatively cold but placing close to L(less_vec) for 2 byte
- jump encoding. */
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
- /* No ymm register was touched. */
- ret
- /* Within the same 16 byte block is L(one_or_less). */
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
-L(last_2x_vec):
- VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
- vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
- VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
- vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
- VPTEST %YMM2, %YMM2, %k1
- kmovd %k1, %eax
ret
.p2align 4
@@ -211,7 +179,7 @@ L(loop_4x_vec):
vpxorq (%rdi), %YMM1, %YMM1
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
- vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
+ vpternlogd $0xde,(VEC_SIZE)(%rdi), %YMM1, %YMM2
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
@@ -238,7 +206,7 @@ L(loop_4x_vec):
VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
/* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
oring with YMM4. Result is stored in YMM4. */
- vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
+ vpternlogd $0xf6,(VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
@@ -256,68 +224,16 @@ L(8x_last_2x_vec):
L(return_neq2):
ret
- /* Relatively cold case as page cross are unexpected. */
- .p2align 4
-L(page_cross_less_vec):
- cmpl $16, %edx
- jae L(between_16_31)
- cmpl $8, %edx
- ja L(between_9_15)
- cmpl $4, %edx
- jb L(between_2_3)
- /* From 4 to 8 bytes. No branch when size == 4. */
- movl (%rdi), %eax
- subl (%rsi), %eax
- movl -4(%rdi, %rdx), %ecx
- movl -4(%rsi, %rdx), %edi
- subl %edi, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
-
- /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
- */
- vmovdqu (%rsi), %xmm1
- vpcmpeqb (%rdi), %xmm1, %xmm1
- vmovdqu -16(%rsi, %rdx), %xmm2
- vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
- vpand %xmm1, %xmm2, %xmm2
- vpmovmskb %xmm2, %eax
- notw %ax
- /* No ymm register was touched. */
- ret
-
.p2align 4,, 8
-L(between_9_15):
- /* From 9 to 15 bytes. */
- movq (%rdi), %rax
- subq (%rsi), %rax
- movq -8(%rdi, %rdx), %rcx
- movq -8(%rsi, %rdx), %rdi
- subq %rdi, %rcx
- orq %rcx, %rax
- /* edx is guranteed to be a non-zero int. */
- cmovnz %edx, %eax
- ret
-
- /* Don't align. This is cold and aligning here will cause code
- to spill into next cache line. */
-L(between_2_3):
- /* From 2 to 3 bytes. No branch when size == 2. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- subl %ecx, %eax
- movzbl -1(%rdi, %rdx), %ecx
- /* All machines that support evex will insert a "merging uop"
- avoiding any serious partial register stalls. */
- subb -1(%rsi, %rdx), %cl
- orl %ecx, %eax
- /* No ymm register was touched. */
+L(last_2x_vec):
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
+ vpxorq -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
+ vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
+ VPTEST %YMM2, %YMM2, %k1
+ kmovd %k1, %eax
ret
- /* 4 Bytes from next cache line. */
+ /* 1 byte from next cache line. */
END (MEMCMPEQ)
#endif
--
2.25.1
* Re: [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S
2021-12-25 3:22 ` [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S Noah Goldstein
@ 2021-12-26 17:05 ` H.J. Lu
0 siblings, 0 replies; 7+ messages in thread
From: H.J. Lu @ 2021-12-26 17:05 UTC
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> [... full patch quoted verbatim; see the patch above ...]
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
2021-12-25 3:22 [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S Noah Goldstein
2021-12-25 3:22 ` [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S Noah Goldstein
@ 2021-12-25 3:24 ` Noah Goldstein
2021-12-26 17:04 ` H.J. Lu
2 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2021-12-25 3:24 UTC
To: GNU C Library
On Fri, Dec 24, 2021 at 9:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug.
> Optimizations are twofold.
>
> 1) Replace page cross and 0/1 checks with masked load instructions in
> L(less_vec). In applications this reduces branch-misses in the
> hot [0, 32] case.
> 2) Change control flow so that the L(less_vec) case gets the fall-through.
>
> Change 2) helps comparisons in the [0, 32] size range but comes at the
> cost of comparisons in the [33, 64] size range. From profiles of GCC and
> Python3, 94%+ and 99%+ of calls are in the [0, 32] range, so this
> appears to be the right tradeoff.
> ---
> sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
> 1 file changed, 56 insertions(+), 193 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> index 640f6757fa..d2899e7c70 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> @@ -62,15 +62,18 @@ Latency:
> # define VMOVU vmovdqu64
>
> # ifdef USE_AS_WMEMCMP
> +# define VMOVU_MASK vmovdqu32
> # define CHAR_SIZE 4
> # define VPCMP vpcmpd
> # define VPTEST vptestmd
> # else
> +# define VMOVU_MASK vmovdqu8
> # define CHAR_SIZE 1
> # define VPCMP vpcmpub
> # define VPTEST vptestmb
> # endif
>
> +
> # define VEC_SIZE 32
> # define PAGE_SIZE 4096
> # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> @@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> movl %edx, %edx
> # endif
> cmp $CHAR_PER_VEC, %RDX_LP
> - jb L(less_vec)
> + /* Fall through for [0, VEC_SIZE] as it's the hottest. */
> + ja L(more_1x_vec)
> +
> + /* Create mask for CHARs we want to compare. This allows us to
> + avoid having to include page cross logic. */
> + movl $-1, %ecx
> + bzhil %edx, %ecx, %ecx
> + kmovd %ecx, %k2
> +
> + /* Safe to load full ymm with mask. */
> + VMOVU_MASK (%rsi), %YMM2{%k2}
> + VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> + kmovd %k1, %eax
> + testl %eax, %eax
> + jnz L(return_vec_0)
> + ret
>
> + .p2align 4
> +L(return_vec_0):
> + tzcntl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + movl (%rdi, %rax, CHAR_SIZE), %ecx
> + xorl %edx, %edx
> + cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + movzbl (%rsi, %rax), %ecx
> + movzbl (%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +
> +
> + .p2align 4
> +L(more_1x_vec):
> /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
> VMOVU (%rsi), %YMM1
> /* Use compare not equals to directly check for mismatch. */
> - VPCMP $4, (%rdi), %YMM1, %k1
> + VPCMP $4,(%rdi), %YMM1, %k1
> kmovd %k1, %eax
> /* NB: eax must be destination register if going to
> L(return_vec_[0,2]). For L(return_vec_3) destination register
> @@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
>
> /* Check third and fourth VEC no matter what. */
> VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> - VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> + VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> kmovd %k1, %eax
> testl %eax, %eax
> jnz L(return_vec_2)
>
> VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> - VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> + VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> kmovd %k1, %ecx
> testl %ecx, %ecx
> jnz L(return_vec_3)
> @@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
> oring with YMM1. Result is stored in YMM4. */
> - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> + vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
>
> /* Or together YMM2, YMM3, and YMM4 into YMM4. */
> vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> @@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> /* NB: eax must be zero to reach here. */
> ret
>
> - .p2align 4
> +
> + .p2align 4,, 8
> L(8x_end_return_vec_0_1_2_3):
> movq %rdx, %rdi
> L(8x_return_vec_0_1_2_3):
> @@ -222,23 +262,6 @@ L(return_vec_3):
> # endif
> ret
>
> - .p2align 4
> -L(return_vec_0):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCMP
> - movl (%rdi, %rax, CHAR_SIZE), %ecx
> - xorl %edx, %edx
> - cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> - /* NB: no partial register stall here because xorl zero idiom
> - above. */
> - setg %dl
> - leal -1(%rdx, %rdx), %eax
> -# else
> - movzbl (%rsi, %rax), %ecx
> - movzbl (%rdi, %rax), %eax
> - subl %ecx, %eax
> -# endif
> - ret
>
> .p2align 4
> L(return_vec_1):
> @@ -297,7 +320,7 @@ L(loop_4x_vec):
> VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> + vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> VPTEST %YMM4, %YMM4, %k1
> kmovd %k1, %ecx
> @@ -324,7 +347,7 @@ L(loop_4x_vec):
> VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
> vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
> VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> - vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
> + vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
> vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> VPTEST %YMM4, %YMM4, %k1
> kmovd %k1, %ecx
> @@ -336,14 +359,14 @@ L(loop_4x_vec):
> /* Only entry is from L(more_8x_vec). */
> .p2align 4,, 10
> L(8x_last_2x_vec):
> - VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
> + VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
> kmovd %k1, %eax
> testl %eax, %eax
> jnz L(8x_return_vec_2)
> /* Naturally aligned to 16 bytes. */
> L(8x_last_1x_vec):
> VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
> - VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
> + VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
> kmovd %k1, %eax
> testl %eax, %eax
> jnz L(8x_return_vec_3)
> @@ -392,7 +415,9 @@ L(last_1x_vec):
> jnz L(return_vec_0_end)
> ret
>
> - .p2align 4,, 10
> +
> + /* Don't align. Takes 2-fetch blocks either way and aligning
> + will cause code to spill into another cacheline. */
> L(return_vec_1_end):
> /* Use bsf to save code size. This is necessary to have
> L(one_or_less) fit in aligning bytes between. */
> @@ -411,31 +436,8 @@ L(return_vec_1_end):
> # endif
> ret
>
> - /* NB: L(one_or_less) fits in alignment padding between
> - L(return_vec_1_end) and L(return_vec_0_end). */
> -# ifdef USE_AS_WMEMCMP
> -L(one_or_less):
> - jb L(zero)
> - movl (%rdi), %ecx
> - xorl %edx, %edx
> - cmpl (%rsi), %ecx
> - je L(zero)
> - setg %dl
> - leal -1(%rdx, %rdx), %eax
> - ret
> -# else
> -L(one_or_less):
> - jb L(zero)
> - movzbl (%rsi), %ecx
> - movzbl (%rdi), %eax
> - subl %ecx, %eax
> - ret
> -# endif
> -L(zero):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 4
> + /* Don't align. Takes 2-fetch blocks either way and aligning
> + will cause code to spill into another cacheline. */
> L(return_vec_0_end):
> tzcntl %eax, %eax
> addl %edx, %eax
> @@ -451,146 +453,7 @@ L(return_vec_0_end):
> subl %ecx, %eax
> # endif
> ret
> + /* 1-byte until next cache line. */
>
> - .p2align 4
> -L(less_vec):
> - /* Check if one or less CHAR. This is necessary for size == 0
> - but is also faster for size == CHAR_SIZE. */
> - cmpl $1, %edx
> - jbe L(one_or_less)
> -
> - /* Check if loading one VEC from either s1 or s2 could cause a
> - page cross. This can have false positives but is by far the
> - fastest method. */
> - movl %edi, %eax
> - orl %esi, %eax
> - andl $(PAGE_SIZE - 1), %eax
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> - jg L(page_cross_less_vec)
> -
> - /* No page cross possible. */
> - VMOVU (%rsi), %YMM2
> - VPCMP $4, (%rdi), %YMM2, %k1
> - kmovd %k1, %eax
> - /* Check if any matches where in bounds. Intentionally not
> - storing result in eax to limit dependency chain if it goes to
> - L(return_vec_0_lv). */
> - bzhil %edx, %eax, %edx
> - jnz L(return_vec_0_lv)
> - xorl %eax, %eax
> - ret
> -
> - /* Essentially duplicate of L(return_vec_0). Ends up not costing
> - any code as shrinks L(less_vec) by allowing 2-byte encoding of
> - the jump and ends up fitting in aligning bytes. As well fits on
> - same cache line as L(less_vec) so also saves a line from having
> - to be fetched on cold calls to memcmp. */
> - .p2align 4,, 4
> -L(return_vec_0_lv):
> - tzcntl %eax, %eax
> -# ifdef USE_AS_WMEMCMP
> - movl (%rdi, %rax, CHAR_SIZE), %ecx
> - xorl %edx, %edx
> - cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> - /* NB: no partial register stall here because xorl zero idiom
> - above. */
> - setg %dl
> - leal -1(%rdx, %rdx), %eax
> -# else
> - movzbl (%rsi, %rax), %ecx
> - movzbl (%rdi, %rax), %eax
> - subl %ecx, %eax
> -# endif
> - ret
> -
> - .p2align 4
> -L(page_cross_less_vec):
> - /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
> - bytes. */
> - cmpl $(16 / CHAR_SIZE), %edx
> - jae L(between_16_31)
> -# ifndef USE_AS_WMEMCMP
> - cmpl $8, %edx
> - jae L(between_8_15)
> - cmpl $4, %edx
> - jb L(between_2_3)
> -
> - /* Load as big endian with overlapping movbe to avoid branches.
> - */
> - movbe (%rdi), %eax
> - movbe (%rsi), %ecx
> - shlq $32, %rax
> - shlq $32, %rcx
> - movbe -4(%rdi, %rdx), %edi
> - movbe -4(%rsi, %rdx), %esi
> - orq %rdi, %rax
> - orq %rsi, %rcx
> - subq %rcx, %rax
> - /* edx is guranteed to be positive int32 in range [4, 7]. */
> - cmovne %edx, %eax
> - /* ecx is -1 if rcx > rax. Otherwise 0. */
> - sbbl %ecx, %ecx
> - /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
> - rax then eax and ecx are zero. If rax < rax then ecx is -1 so
> - eax doesn't matter. */
> - orl %ecx, %eax
> - ret
> -
> - .p2align 4,, 8
> -L(between_8_15):
> -# endif
> - /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
> - vmovq (%rdi), %xmm1
> - vmovq (%rsi), %xmm2
> - VPCMP $4, %xmm1, %xmm2, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(return_vec_0_lv)
> - /* Use overlapping loads to avoid branches. */
> - vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
> - vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
> - VPCMP $4, %xmm1, %xmm2, %k1
> - addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(return_vec_0_end)
> - ret
> -
> - .p2align 4,, 8
> -L(between_16_31):
> - /* From 16 to 31 bytes. No branch when size == 16. */
> -
> - /* Use movups to save code size. */
> - vmovdqu (%rsi), %xmm2
> - VPCMP $4, (%rdi), %xmm2, %k1
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(return_vec_0_lv)
> - /* Use overlapping loads to avoid branches. */
> - vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
> - VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
> - addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
> - kmovd %k1, %eax
> - testl %eax, %eax
> - jnz L(return_vec_0_end)
> - ret
> -
> -# ifndef USE_AS_WMEMCMP
> -L(between_2_3):
> - /* Load as big endian to avoid branches. */
> - movzwl (%rdi), %eax
> - movzwl (%rsi), %ecx
> - shll $8, %eax
> - shll $8, %ecx
> - bswap %eax
> - bswap %ecx
> - movzbl -1(%rdi, %rdx), %edi
> - movzbl -1(%rsi, %rdx), %esi
> - orl %edi, %eax
> - orl %esi, %ecx
> - /* Subtraction is okay because the upper 8 bits are zero. */
> - subl %ecx, %eax
> - ret
> -# endif
> END (MEMCMP)
> #endif
> --
> 2.25.1
>
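The xorl/setg/leal sequence in L(return_vec_0) above computes the signed
wmemcmp result without a branch once a mismatching CHAR is known; in C
the same mapping looks roughly like this (illustrative sketch only):

    /* Mirrors: xorl %edx, %edx; cmpl b, a; setg %dl;
       leal -1(%rdx, %rdx), %eax.  Only reached when a != b, so the
       zero (equal) result can never be produced here.  */
    int
    wmemcmp_result (int a, int b)
    {
      int g = (a > b);   /* setg: 1 if a > b (signed), else 0.  */
      return 2 * g - 1;  /* lea -1(g, g): maps {0, 1} to {-1, 1}.  */
    }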
Numbers attached here.
Numbers collected on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
Numbers are the geomean of N = 20 runs.
Times are reported as (new T) / (old T).
Some notes: there is significant improvement in the [0, 32] size
range. Roughly 33-37% improvement for the page cross case (including
false positives) and 17-20% for the normal case.
On the other hand, there are fairly significant slowdowns in the
[33, 64] size range. For memcmpeq these regressions are roughly
5-20%; for memcmp they are around 10-15%.
In general I believe this is a worthwhile tradeoff.
As stated in the commit message, profiles of GCC compiling itself and
Python3 running the pyperf benchmark suite indicate that 94%+ and 99%+
of memcmp calls, respectively, fall into the [0, 32] size range.
Since the speedup in the [0, 32] case is greater than the regression
in the [33, 64] case and the [0, 32] case covers the vast majority of
cases, I think we can expect this change to benefit applications.
If people think otherwise, I have patches that show no regressions but
more modest speedups for the [0, 32] case (0-5% for the normal case and
15-20% for the page cross case).
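
As a rough back-of-the-envelope check (illustrative numbers only): if
94% of calls land in [0, 32] with a ~18% speedup and the remaining 6%
all landed in [33, 64] with a ~12% regression, the net expected change
is about 0.94 * 0.18 - 0.06 * 0.12 = 0.162, i.e. a ~16% improvement,
which is why the [0, 32] speedup should dominate in practice.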
[-- Attachment #2: memcmp-numbers.pdf --]
[-- Type: application/pdf, Size: 113103 bytes --]
* Re: [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
2021-12-25 3:22 [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S Noah Goldstein
2021-12-25 3:22 ` [PATCH v1 2/2] x86: Optimize L(less_vec) case in memcmpeq-evex.S Noah Goldstein
2021-12-25 3:24 ` [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S Noah Goldstein
@ 2021-12-26 17:04 ` H.J. Lu
2021-12-27 10:08 ` Noah Goldstein
2 siblings, 1 reply; 7+ messages in thread
From: H.J. Lu @ 2021-12-26 17:04 UTC
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> [... full patch quoted verbatim; see the patch in the previous message ...]
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
2021-12-26 17:04 ` H.J. Lu
@ 2021-12-27 10:08 ` Noah Goldstein
2022-04-23 1:53 ` Sunil Pandey
0 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2021-12-27 10:08 UTC
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
On Sun, Dec 26, 2021 at 11:05 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > [... full patch quoted verbatim; see the patch in the previous messages ...]
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
Thanks pushed.
>
> --
> H.J.
* Re: [PATCH v1 1/2] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
2021-12-27 10:08 ` Noah Goldstein
@ 2022-04-23 1:53 ` Sunil Pandey
0 siblings, 0 replies; 7+ messages in thread
From: Sunil Pandey @ 2022-04-23 1:53 UTC
To: Noah Goldstein, libc-stable; +Cc: H.J. Lu, GNU C Library
On Mon, Dec 27, 2021 at 2:08 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sun, Dec 26, 2021 at 11:05 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Dec 24, 2021 at 7:23 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No bug.
> > > Optimizations are twofold.
> > >
> > > 1) Replace page cross and 0/1 checks with masked load instructions in
> > > L(less_vec). In applications this reduces branch-misses in the
> > > hot [0, 32] case.
> > > 2) Change controlflow so that L(less_vec) case gets the fall through.
> > >
> > > Change 2) helps copies in the [0, 32] size range but comes at the cost
> > > of copies in the [33, 64] size range. From profiles of GCC and
> > > Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
> > > appears to the the right tradeoff.
> > > ---
> > > sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
> > > 1 file changed, 56 insertions(+), 193 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > > index 640f6757fa..d2899e7c70 100644
> > > --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > > +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
> > > @@ -62,15 +62,18 @@ Latency:
> > > # define VMOVU vmovdqu64
> > >
> > > # ifdef USE_AS_WMEMCMP
> > > +# define VMOVU_MASK vmovdqu32
> > > # define CHAR_SIZE 4
> > > # define VPCMP vpcmpd
> > > # define VPTEST vptestmd
> > > # else
> > > +# define VMOVU_MASK vmovdqu8
> > > # define CHAR_SIZE 1
> > > # define VPCMP vpcmpub
> > > # define VPTEST vptestmb
> > > # endif
> > >
> > > +
> > > # define VEC_SIZE 32
> > > # define PAGE_SIZE 4096
> > > # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > > @@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> > > movl %edx, %edx
> > > # endif
> > > cmp $CHAR_PER_VEC, %RDX_LP
> > > - jb L(less_vec)
> > > + /* Fall through for [0, VEC_SIZE] as its the hottest. */
> > > + ja L(more_1x_vec)
> > > +
> > > + /* Create mask for CHAR's we want to compare. This allows us to
> > > + avoid having to include page cross logic. */
> > > + movl $-1, %ecx
> > > + bzhil %edx, %ecx, %ecx
> > > + kmovd %ecx, %k2
> > > +
> > > + /* Safe to load full ymm with mask. */
> > > + VMOVU_MASK (%rsi), %YMM2{%k2}
> > > + VPCMP $4,(%rdi), %YMM2, %k1{%k2}
> > > + kmovd %k1, %eax
> > > + testl %eax, %eax
> > > + jnz L(return_vec_0)
> > > + ret
> > >
> > > + .p2align 4
> > > +L(return_vec_0):
> > > + tzcntl %eax, %eax
> > > +# ifdef USE_AS_WMEMCMP
> > > + movl (%rdi, %rax, CHAR_SIZE), %ecx
> > > + xorl %edx, %edx
> > > + cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> > > + /* NB: no partial register stall here because xorl zero idiom
> > > + above. */
> > > + setg %dl
> > > + leal -1(%rdx, %rdx), %eax
> > > +# else
> > > + movzbl (%rsi, %rax), %ecx
> > > + movzbl (%rdi, %rax), %eax
> > > + subl %ecx, %eax
> > > +# endif
> > > + ret
> > > +
> > > +
> > > + .p2align 4
> > > +L(more_1x_vec):
> > > /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
> > > VMOVU (%rsi), %YMM1
> > > /* Use compare not equals to directly check for mismatch. */
> > > - VPCMP $4, (%rdi), %YMM1, %k1
> > > + VPCMP $4,(%rdi), %YMM1, %k1
> > > kmovd %k1, %eax
> > > /* NB: eax must be destination register if going to
> > > L(return_vec_[0,2]). For L(return_vec_3) destination register
> > > @@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> > >
> > > /* Check third and fourth VEC no matter what. */
> > > VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
> > > - VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> > > + VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
> > > kmovd %k1, %eax
> > > testl %eax, %eax
> > > jnz L(return_vec_2)
> > >
> > > VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> > > - VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> > > + VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
> > > kmovd %k1, %ecx
> > > testl %ecx, %ecx
> > > jnz L(return_vec_3)
> > > @@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> > > VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
> > > /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
> > > oring with YMM1. Result is stored in YMM4. */
> > > - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> > > + vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> > >
> > > /* Or together YMM2, YMM3, and YMM4 into YMM4. */
> > > vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> > > @@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
> > > /* NB: eax must be zero to reach here. */
> > > ret
> > >
> > > - .p2align 4
> > > +
> > > + .p2align 4,, 8
> > > L(8x_end_return_vec_0_1_2_3):
> > > movq %rdx, %rdi
> > > L(8x_return_vec_0_1_2_3):
> > > @@ -222,23 +262,6 @@ L(return_vec_3):
> > > # endif
> > > ret
> > >
> > > - .p2align 4
> > > -L(return_vec_0):
> > > - tzcntl %eax, %eax
> > > -# ifdef USE_AS_WMEMCMP
> > > - movl (%rdi, %rax, CHAR_SIZE), %ecx
> > > - xorl %edx, %edx
> > > - cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> > > - /* NB: no partial register stall here because xorl zero idiom
> > > - above. */
> > > - setg %dl
> > > - leal -1(%rdx, %rdx), %eax
> > > -# else
> > > - movzbl (%rsi, %rax), %ecx
> > > - movzbl (%rdi, %rax), %eax
> > > - subl %ecx, %eax
> > > -# endif
> > > - ret
> > >
> > > .p2align 4
> > > L(return_vec_1):
> > > @@ -297,7 +320,7 @@ L(loop_4x_vec):
> > > VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> > > vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> > > VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> > > - vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> > > + vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
> > > vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> > > VPTEST %YMM4, %YMM4, %k1
> > > kmovd %k1, %ecx
> > > @@ -324,7 +347,7 @@ L(loop_4x_vec):
> > > VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
> > > vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
> > > VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> > > - vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
> > > + vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
> > > vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> > > VPTEST %YMM4, %YMM4, %k1
> > > kmovd %k1, %ecx
> > > @@ -336,14 +359,14 @@ L(loop_4x_vec):
> > > /* Only entry is from L(more_8x_vec). */
> > > .p2align 4,, 10
> > > L(8x_last_2x_vec):
> > > - VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
> > > + VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
> > > kmovd %k1, %eax
> > > testl %eax, %eax
> > > jnz L(8x_return_vec_2)
> > > /* Naturally aligned to 16 bytes. */
> > > L(8x_last_1x_vec):
> > > VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
> > > - VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
> > > + VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
> > > kmovd %k1, %eax
> > > testl %eax, %eax
> > > jnz L(8x_return_vec_3)
> > > @@ -392,7 +415,9 @@ L(last_1x_vec):
> > > jnz L(return_vec_0_end)
> > > ret
> > >
> > > - .p2align 4,, 10
> > > +
> > > + /* Don't align. It takes two fetch blocks either way, and
> > > + aligning would spill the code into another cache line. */
> > > L(return_vec_1_end):
> > > /* Use bsf to save code size. This is necessary to have
> > > L(one_or_less) fit in aligning bytes between. */
> > > @@ -411,31 +436,8 @@ L(return_vec_1_end):
> > > # endif
> > > ret
> > >
> > > - /* NB: L(one_or_less) fits in alignment padding between
> > > - L(return_vec_1_end) and L(return_vec_0_end). */
> > > -# ifdef USE_AS_WMEMCMP
> > > -L(one_or_less):
> > > - jb L(zero)
> > > - movl (%rdi), %ecx
> > > - xorl %edx, %edx
> > > - cmpl (%rsi), %ecx
> > > - je L(zero)
> > > - setg %dl
> > > - leal -1(%rdx, %rdx), %eax
> > > - ret
> > > -# else
> > > -L(one_or_less):
> > > - jb L(zero)
> > > - movzbl (%rsi), %ecx
> > > - movzbl (%rdi), %eax
> > > - subl %ecx, %eax
> > > - ret
> > > -# endif
> > > -L(zero):
> > > - xorl %eax, %eax
> > > - ret
> > > -
> > > - .p2align 4
> > > + /* Don't align. It takes two fetch blocks either way, and
> > > + aligning would spill the code into another cache line. */
> > > L(return_vec_0_end):
> > > tzcntl %eax, %eax
> > > addl %edx, %eax
> > > @@ -451,146 +453,7 @@ L(return_vec_0_end):
> > > subl %ecx, %eax
> > > # endif
> > > ret
> > > + /* 1 byte left until the next cache line. */
> > >
> > > - .p2align 4
> > > -L(less_vec):
> > > - /* Check if one or less CHAR. This is necessary for size == 0
> > > - but is also faster for size == CHAR_SIZE. */
> > > - cmpl $1, %edx
> > > - jbe L(one_or_less)
> > > -
> > > - /* Check if loading one VEC from either s1 or s2 could cause a
> > > - page cross. This can have false positives but is by far the
> > > - fastest method. */
> > > - movl %edi, %eax
> > > - orl %esi, %eax
> > > - andl $(PAGE_SIZE - 1), %eax
> > > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > - jg L(page_cross_less_vec)
> > > -
> > > - /* No page cross possible. */
> > > - VMOVU (%rsi), %YMM2
> > > - VPCMP $4, (%rdi), %YMM2, %k1
> > > - kmovd %k1, %eax
> > > - /* Check if any matches where in bounds. Intentionally not
> > > - storing result in eax to limit dependency chain if it goes to
> > > - L(return_vec_0_lv). */
> > > - bzhil %edx, %eax, %edx
> > > - jnz L(return_vec_0_lv)
> > > - xorl %eax, %eax
> > > - ret
> > > -
> > > - /* Essentially duplicate of L(return_vec_0). Ends up not costing
> > > - any code as shrinks L(less_vec) by allowing 2-byte encoding of
> > > - the jump and ends up fitting in aligning bytes. As well fits on
> > > - same cache line as L(less_vec) so also saves a line from having
> > > - to be fetched on cold calls to memcmp. */
> > > - .p2align 4,, 4
> > > -L(return_vec_0_lv):
> > > - tzcntl %eax, %eax
> > > -# ifdef USE_AS_WMEMCMP
> > > - movl (%rdi, %rax, CHAR_SIZE), %ecx
> > > - xorl %edx, %edx
> > > - cmpl (%rsi, %rax, CHAR_SIZE), %ecx
> > > - /* NB: no partial register stall here because xorl zero idiom
> > > - above. */
> > > - setg %dl
> > > - leal -1(%rdx, %rdx), %eax
> > > -# else
> > > - movzbl (%rsi, %rax), %ecx
> > > - movzbl (%rdi, %rax), %eax
> > > - subl %ecx, %eax
> > > -# endif
> > > - ret
> > > -
> > > - .p2align 4
> > > -L(page_cross_less_vec):
> > > - /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
> > > - bytes. */
> > > - cmpl $(16 / CHAR_SIZE), %edx
> > > - jae L(between_16_31)
> > > -# ifndef USE_AS_WMEMCMP
> > > - cmpl $8, %edx
> > > - jae L(between_8_15)
> > > - cmpl $4, %edx
> > > - jb L(between_2_3)
> > > -
> > > - /* Load as big endian with overlapping movbe to avoid branches.
> > > - */
> > > - movbe (%rdi), %eax
> > > - movbe (%rsi), %ecx
> > > - shlq $32, %rax
> > > - shlq $32, %rcx
> > > - movbe -4(%rdi, %rdx), %edi
> > > - movbe -4(%rsi, %rdx), %esi
> > > - orq %rdi, %rax
> > > - orq %rsi, %rcx
> > > - subq %rcx, %rax
> > > - /* edx is guranteed to be positive int32 in range [4, 7]. */
> > > - cmovne %edx, %eax
> > > - /* ecx is -1 if rcx > rax. Otherwise 0. */
> > > - sbbl %ecx, %ecx
> > > - /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
> > > - rax then eax and ecx are zero. If rax < rax then ecx is -1 so
> > > - eax doesn't matter. */
> > > - orl %ecx, %eax
> > > - ret
> > > -
> > > - .p2align 4,, 8
> > > -L(between_8_15):
> > > -# endif
> > > - /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
> > > - vmovq (%rdi), %xmm1
> > > - vmovq (%rsi), %xmm2
> > > - VPCMP $4, %xmm1, %xmm2, %k1
> > > - kmovd %k1, %eax
> > > - testl %eax, %eax
> > > - jnz L(return_vec_0_lv)
> > > - /* Use overlapping loads to avoid branches. */
> > > - vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
> > > - vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
> > > - VPCMP $4, %xmm1, %xmm2, %k1
> > > - addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
> > > - kmovd %k1, %eax
> > > - testl %eax, %eax
> > > - jnz L(return_vec_0_end)
> > > - ret
> > > -
> > > - .p2align 4,, 8
> > > -L(between_16_31):
> > > - /* From 16 to 31 bytes. No branch when size == 16. */
> > > -
> > > - /* Use movups to save code size. */
> > > - vmovdqu (%rsi), %xmm2
> > > - VPCMP $4, (%rdi), %xmm2, %k1
> > > - kmovd %k1, %eax
> > > - testl %eax, %eax
> > > - jnz L(return_vec_0_lv)
> > > - /* Use overlapping loads to avoid branches. */
> > > - vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
> > > - VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
> > > - addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
> > > - kmovd %k1, %eax
> > > - testl %eax, %eax
> > > - jnz L(return_vec_0_end)
> > > - ret
> > > -
> > > -# ifndef USE_AS_WMEMCMP
> > > -L(between_2_3):
> > > - /* Load as big endian to avoid branches. */
> > > - movzwl (%rdi), %eax
> > > - movzwl (%rsi), %ecx
> > > - shll $8, %eax
> > > - shll $8, %ecx
> > > - bswap %eax
> > > - bswap %ecx
> > > - movzbl -1(%rdi, %rdx), %edi
> > > - movzbl -1(%rsi, %rdx), %esi
> > > - orl %edi, %eax
> > > - orl %esi, %ecx
> > > - /* Subtraction is okay because the upper 8 bits are zero. */
> > > - subl %ecx, %eax
> > > - ret
> > > -# endif
> > > END (MEMCMP)
> > > #endif
> > > --
> > > 2.25.1
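
For readers less fluent in EVEX assembly: the new [0, VEC_SIZE]
fall-through path corresponds to roughly the following C-intrinsics
sketch. This is illustrative only -- the helper name is made up, it
is not glibc code, and it assumes AVX512BW/VL plus BMI1/BMI2 and that
masked loads suppress faults on masked-off elements:

    #include <immintrin.h>
    #include <stddef.h>

    /* Hypothetical sketch of the bzhi/kmov/masked-load sequence for
       the byte (memcmp) flavor with len <= 32.  */
    static int
    memcmp_small_sketch (const unsigned char *s1,
                         const unsigned char *s2, size_t len)
    {
      /* bzhil %edx, %ecx, %ecx: keep the low `len' bits of all-ones.  */
      __mmask32 k
        = _cvtu32_mask32 (_bzhi_u32 (0xffffffffU, (unsigned int) len));

      /* Masked loads never touch bytes at index >= len, so a full
         32-byte access that would cross a page cannot fault.  */
      __m256i v1 = _mm256_maskz_loadu_epi8 (k, s1);
      __m256i v2 = _mm256_maskz_loadu_epi8 (k, s2);

      /* VPCMP $4 (not-equal), still under the length mask.  */
      __mmask32 neq = _mm256_mask_cmpneq_epu8_mask (k, v1, v2);
      if (neq == 0)
        return 0;

      /* tzcnt on the mask bits yields the first differing index.  */
      unsigned int i = _tzcnt_u32 (_cvtmask32_u32 (neq));
      return (int) s1[i] - (int) s2[i];
    }

The point is that the size == 0/1 check and the page-cross check both
disappear into the single bzhi mask computation, which is what cuts
the branch misses in the hot [0, 32] range.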
> > >
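
The branchless return-value computation in L(return_vec_0) for the
wmemcmp case is also worth spelling out. A minimal C sketch of the
setg/lea pair (hedged: it assumes a mismatch was already found, so
the operands differ, and the function name is invented):

    /* xorl %edx, %edx; cmpl ...; setg %dl; leal -1(%rdx, %rdx), %eax.
       The xor zero-idiom pre-clears edx so setg writes a clean 0/1.  */
    static int
    wcmp_result_sketch (int a, int b)
    {
      int g = (a > b);   /* setg %dl after the signed compare */
      return 2 * g - 1;  /* lea -1(%rdx, %rdx): 1 -> +1, 0 -> -1 */
    }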
> >
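
Likewise the two vpternlogd immediates used in the 4x-vector blocks:
$0xde encodes the three-input boolean (a ^ c) | b, and $0xfe encodes
a | b | c, so one xor-and-accumulate plus one triple-or feed a single
VPTEST. A rough intrinsics equivalent (again a sketch with invented
names, assuming AVX512F/VL):

    #include <immintrin.h>

    /* d1..d3 hold the xor-differences of the first three vectors;
       s4/t4 are the fourth vector loaded from each source.  */
    static __m256i
    combine_diffs_sketch (__m256i d1, __m256i d2, __m256i d3,
                          __m256i s4, __m256i t4)
    {
      /* vpternlogd $0xde: d4 = (s4 ^ t4) | d1 in one instruction.  */
      __m256i d4 = _mm256_ternarylogic_epi32 (s4, d1, t4, 0xde);
      /* vpternlogd $0xfe: fold in the rest, d4 | d2 | d3.  */
      return _mm256_ternarylogic_epi32 (d4, d2, d3, 0xfe);
    }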
> > LGTM.
> >
> > Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
> >
> > Thanks.
>
> Thanks, pushed.
> >
> > --
> > H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil