From: "H.J. Lu" <hjl.tools@gmail.com>
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v3 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S
Date: Fri, 15 Apr 2022 10:33:35 -0700 [thread overview]
Message-ID: <CAMe9rOofRMrG0Hvjq42r_c-24juXbS+rRb7QXKGjetmSp93Uwg@mail.gmail.com> (raw)
In-Reply-To: <20220415172801.1525674-3-goldstein.w.n@gmail.com>
On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Old code was both inefficient and wasted code size. New code (-62
> bytes) and comparable or better performance in the page cross case.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.960
>
> size, align0, align1, ret, New Time/Old Time
> 1, 4095, 0, 0, 1.001
> 1, 4095, 0, 1, 0.999
> 1, 4095, 0, -1, 1.0
> 2, 4094, 0, 0, 1.0
> 2, 4094, 0, 1, 1.0
> 2, 4094, 0, -1, 1.0
> 3, 4093, 0, 0, 1.0
> 3, 4093, 0, 1, 1.0
> 3, 4093, 0, -1, 1.0
> 4, 4092, 0, 0, 0.987
> 4, 4092, 0, 1, 1.0
> 4, 4092, 0, -1, 1.0
> 5, 4091, 0, 0, 0.984
> 5, 4091, 0, 1, 1.002
> 5, 4091, 0, -1, 1.005
> 6, 4090, 0, 0, 0.993
> 6, 4090, 0, 1, 1.001
> 6, 4090, 0, -1, 1.003
> 7, 4089, 0, 0, 0.991
> 7, 4089, 0, 1, 1.0
> 7, 4089, 0, -1, 1.001
> 8, 4088, 0, 0, 0.875
> 8, 4088, 0, 1, 0.881
> 8, 4088, 0, -1, 0.888
> 9, 4087, 0, 0, 0.872
> 9, 4087, 0, 1, 0.879
> 9, 4087, 0, -1, 0.883
> 10, 4086, 0, 0, 0.878
> 10, 4086, 0, 1, 0.886
> 10, 4086, 0, -1, 0.873
> 11, 4085, 0, 0, 0.878
> 11, 4085, 0, 1, 0.881
> 11, 4085, 0, -1, 0.879
> 12, 4084, 0, 0, 0.873
> 12, 4084, 0, 1, 0.889
> 12, 4084, 0, -1, 0.875
> 13, 4083, 0, 0, 0.873
> 13, 4083, 0, 1, 0.863
> 13, 4083, 0, -1, 0.863
> 14, 4082, 0, 0, 0.838
> 14, 4082, 0, 1, 0.869
> 14, 4082, 0, -1, 0.877
> 15, 4081, 0, 0, 0.841
> 15, 4081, 0, 1, 0.869
> 15, 4081, 0, -1, 0.876
> 16, 4080, 0, 0, 0.988
> 16, 4080, 0, 1, 0.99
> 16, 4080, 0, -1, 0.989
> 17, 4079, 0, 0, 0.978
> 17, 4079, 0, 1, 0.981
> 17, 4079, 0, -1, 0.98
> 18, 4078, 0, 0, 0.981
> 18, 4078, 0, 1, 0.98
> 18, 4078, 0, -1, 0.985
> 19, 4077, 0, 0, 0.977
> 19, 4077, 0, 1, 0.979
> 19, 4077, 0, -1, 0.986
> 20, 4076, 0, 0, 0.977
> 20, 4076, 0, 1, 0.986
> 20, 4076, 0, -1, 0.984
> 21, 4075, 0, 0, 0.977
> 21, 4075, 0, 1, 0.983
> 21, 4075, 0, -1, 0.988
> 22, 4074, 0, 0, 0.983
> 22, 4074, 0, 1, 0.994
> 22, 4074, 0, -1, 0.993
> 23, 4073, 0, 0, 0.98
> 23, 4073, 0, 1, 0.992
> 23, 4073, 0, -1, 0.995
> 24, 4072, 0, 0, 0.989
> 24, 4072, 0, 1, 0.989
> 24, 4072, 0, -1, 0.991
> 25, 4071, 0, 0, 0.99
> 25, 4071, 0, 1, 0.999
> 25, 4071, 0, -1, 0.996
> 26, 4070, 0, 0, 0.993
> 26, 4070, 0, 1, 0.995
> 26, 4070, 0, -1, 0.998
> 27, 4069, 0, 0, 0.993
> 27, 4069, 0, 1, 0.999
> 27, 4069, 0, -1, 1.0
> 28, 4068, 0, 0, 0.997
> 28, 4068, 0, 1, 1.0
> 28, 4068, 0, -1, 0.999
> 29, 4067, 0, 0, 0.996
> 29, 4067, 0, 1, 0.999
> 29, 4067, 0, -1, 0.999
> 30, 4066, 0, 0, 0.991
> 30, 4066, 0, 1, 1.001
> 30, 4066, 0, -1, 0.999
> 31, 4065, 0, 0, 0.988
> 31, 4065, 0, 1, 0.998
> 31, 4065, 0, -1, 0.998
> ---
> sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
> 1 file changed, 61 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> index a34ea1645d..210c9925b6 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> @@ -429,22 +429,21 @@ L(page_cross_less_vec):
> # ifndef USE_AS_WMEMCMP
> cmpl $8, %edx
> jae L(between_8_15)
> + /* Fall through for [4, 7]. */
> cmpl $4, %edx
> - jae L(between_4_7)
> + jb L(between_2_3)
>
> - /* Load as big endian to avoid branches. */
> - movzwl (%rdi), %eax
> - movzwl (%rsi), %ecx
> - shll $8, %eax
> - shll $8, %ecx
> - bswap %eax
> - bswap %ecx
> - movzbl -1(%rdi, %rdx), %edi
> - movzbl -1(%rsi, %rdx), %esi
> - orl %edi, %eax
> - orl %esi, %ecx
> - /* Subtraction is okay because the upper 8 bits are zero. */
> - subl %ecx, %eax
> + movbe (%rdi), %eax
> + movbe (%rsi), %ecx
> + shlq $32, %rax
> + shlq $32, %rcx
> + movbe -4(%rdi, %rdx), %edi
> + movbe -4(%rsi, %rdx), %esi
> + orq %rdi, %rax
> + orq %rsi, %rcx
> + subq %rcx, %rax
> + /* Fast path for return zero. */
> + jnz L(ret_nonzero)
> /* No ymm register was touched. */
> ret
>
> @@ -457,9 +456,33 @@ L(one_or_less):
> /* No ymm register was touched. */
> ret
>
> + .p2align 4,, 5
> +L(ret_nonzero):
> + sbbl %eax, %eax
> + orl $1, %eax
> + /* No ymm register was touched. */
> + ret
> +
> + .p2align 4,, 2
> +L(zero):
> + xorl %eax, %eax
> + /* No ymm register was touched. */
> + ret
> +
> .p2align 4
> L(between_8_15):
> -# endif
> + movbe (%rdi), %rax
> + movbe (%rsi), %rcx
> + subq %rcx, %rax
> + jnz L(ret_nonzero)
> + movbe -8(%rdi, %rdx), %rax
> + movbe -8(%rsi, %rdx), %rcx
> + subq %rcx, %rax
> + /* Fast path for return zero. */
> + jnz L(ret_nonzero)
> + /* No ymm register was touched. */
> + ret
> +# else
> /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
> vmovq (%rdi), %xmm1
> vmovq (%rsi), %xmm2
> @@ -475,16 +498,13 @@ L(between_8_15):
> VPCMPEQ %xmm1, %xmm2, %xmm2
> vpmovmskb %xmm2, %eax
> subl $0xffff, %eax
> + /* Fast path for return zero. */
> jnz L(return_vec_0)
> /* No ymm register was touched. */
> ret
> +# endif
>
> - .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 4
> + .p2align 4,, 10
> L(between_16_31):
> /* From 16 to 31 bytes. No branch when size == 16. */
> vmovdqu (%rsi), %xmm2
> @@ -501,11 +521,17 @@ L(between_16_31):
> VPCMPEQ (%rdi), %xmm2, %xmm2
> vpmovmskb %xmm2, %eax
> subl $0xffff, %eax
> + /* Fast path for return zero. */
> jnz L(return_vec_0)
> /* No ymm register was touched. */
> ret
>
> # ifdef USE_AS_WMEMCMP
> + .p2align 4,, 2
> +L(zero):
> + xorl %eax, %eax
> + ret
> +
> .p2align 4
> L(one_or_less):
> jb L(zero)
> @@ -520,22 +546,20 @@ L(one_or_less):
> # else
>
> .p2align 4
> -L(between_4_7):
> - /* Load as big endian with overlapping movbe to avoid branches.
> - */
> - movbe (%rdi), %eax
> - movbe (%rsi), %ecx
> - shlq $32, %rax
> - shlq $32, %rcx
> - movbe -4(%rdi, %rdx), %edi
> - movbe -4(%rsi, %rdx), %esi
> - orq %rdi, %rax
> - orq %rsi, %rcx
> - subq %rcx, %rax
> - jz L(zero_4_7)
> - sbbl %eax, %eax
> - orl $1, %eax
> -L(zero_4_7):
> +L(between_2_3):
> + /* Load as big endian to avoid branches. */
> + movzwl (%rdi), %eax
> + movzwl (%rsi), %ecx
> + bswap %eax
> + bswap %ecx
> + shrl %eax
> + shrl %ecx
> + movzbl -1(%rdi, %rdx), %edi
> + movzbl -1(%rsi, %rdx), %esi
> + orl %edi, %eax
> + orl %esi, %ecx
> + /* Subtraction is okay because the upper bit is zero. */
> + subl %ecx, %eax
> /* No ymm register was touched. */
> ret
> # endif
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
next prev parent reply other threads:[~2022-04-15 17:34 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-04-15 5:51 [PATCH v1 1/3] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-15 5:51 ` [PATCH v1 2/3] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-15 17:20 ` H.J. Lu
2022-04-15 17:29 ` Noah Goldstein
2022-04-15 5:51 ` [PATCH v1 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-15 17:21 ` H.J. Lu
2022-04-15 17:13 ` [PATCH v1 1/3] x86: Optimize memcmp SSE2 in memcmp.S H.J. Lu
2022-04-15 17:25 ` [PATCH v2 " Noah Goldstein
2022-04-15 17:25 ` [PATCH v2 2/3] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-15 17:25 ` [PATCH v2 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-15 17:27 ` [PATCH v3 1/3] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-15 17:28 ` [PATCH v3 2/3] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-15 17:32 ` H.J. Lu
2022-05-12 20:01 ` Sunil Pandey
2022-04-15 17:28 ` [PATCH v3 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-15 17:33 ` H.J. Lu [this message]
2022-05-12 20:03 ` Sunil Pandey
2022-04-15 17:33 ` [PATCH v3 1/3] x86: Optimize memcmp SSE2 in memcmp.S H.J. Lu
2022-04-15 18:44 ` Noah Goldstein
2022-04-19 20:30 ` Joseph Myers
2022-04-19 20:53 ` Noah Goldstein
2022-04-19 22:56 ` Noah Goldstein
2022-04-20 1:19 ` Noah Goldstein
2022-05-12 19:59 ` Sunil Pandey
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAMe9rOofRMrG0Hvjq42r_c-24juXbS+rRb7QXKGjetmSp93Uwg@mail.gmail.com \
--to=hjl.tools@gmail.com \
--cc=carlos@systemhalted.org \
--cc=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).