From: "H.J. Lu" <hjl.tools@gmail.com>
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v1] x86-64: Small improvements to dl-trampoline.S
Date: Tue, 28 Jun 2022 11:15:24 -0700 [thread overview]
Message-ID: <CAMe9rOqLTpyjM4GCePftsQuhe-qWHBDgUb4mpFxOS2SPCa5ogQ@mail.gmail.com> (raw)
In-Reply-To: <20220628152628.17802-1-goldstein.w.n@gmail.com>
On Tue, Jun 28, 2022 at 8:26 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Remove sse2 instructions when using the avx512 or avx version.
>
> 2. Fixup some format nits in how the address offsets were aligned.
>
> 3. Use more space efficient instructions in the conditional AVX
> restoral.
> - vpcmpeqq -> vpcmpeqb
> - cmp imm32, r; jz -> inc r; jz
>
> 4. Use `rep movsb` instead of `rep movsq`. The former is guaranteed to
> be fast with the ERMS flag, the latter is not. The latter also
> wastes an instruction in size setup.
> ---
> sysdeps/x86_64/dl-trampoline.S | 4 ++
> sysdeps/x86_64/dl-trampoline.h | 113 +++++++++++++++++----------------
> 2 files changed, 61 insertions(+), 56 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index f669805ac5..580d2b6499 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -57,22 +57,26 @@
> #define VMOVA vmovdqa64
> #define VEC(i) zmm##i
> #define _dl_runtime_profile _dl_runtime_profile_avx512
> +# define SECTION(p) p##.evex512
> #include "dl-trampoline.h"
> #undef _dl_runtime_profile
> #undef VEC
> #undef VMOVA
> #undef VEC_SIZE
> +#undef SECTION
>
> #if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
> # define VEC_SIZE 32
> # define VMOVA vmovdqa
> # define VEC(i) ymm##i
> +# define SECTION(p) p##.avx
> # define _dl_runtime_profile _dl_runtime_profile_avx
> # include "dl-trampoline.h"
> # undef _dl_runtime_profile
> # undef VEC
> # undef VMOVA
> # undef VEC_SIZE
> +# undef SECTION
> #endif
>
> #if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index 03bd91b3e9..3c419047ec 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -16,7 +16,11 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> - .text
> +#ifndef SECTION
> +# define SECTION(p) p
> +#endif
> +
> + .section SECTION(.text),"ax",@progbits
> #ifdef _dl_runtime_resolve
>
> # undef REGISTER_SAVE_AREA
> @@ -219,19 +223,19 @@ _dl_runtime_profile:
> /* We always store the XMM registers even if AVX is available.
> This is to provide backward binary compatibility for existing
> audit modules. */
> - movaps %xmm0, (LR_XMM_OFFSET)(%rsp)
> - movaps %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
> - movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
> - movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
> - movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
> - movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
> - movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
> - movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
> + VMOVA %xmm0, (LR_XMM_OFFSET + XMM_SIZE*0)(%rsp)
> + VMOVA %xmm1, (LR_XMM_OFFSET + XMM_SIZE*1)(%rsp)
> + VMOVA %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
> + VMOVA %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
> + VMOVA %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
> + VMOVA %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
> + VMOVA %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
> + VMOVA %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
>
> # ifdef RESTORE_AVX
> /* This is to support AVX audit modules. */
> - VMOVA %VEC(0), (LR_VECTOR_OFFSET)(%rsp)
> - VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
> + VMOVA %VEC(0), (LR_VECTOR_OFFSET + VECTOR_SIZE*0)(%rsp)
> + VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE*1)(%rsp)
> VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
> VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
> VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
> @@ -241,8 +245,8 @@ _dl_runtime_profile:
>
> /* Save xmm0-xmm7 registers to detect if any of them are
> changed by audit module. */
> - vmovdqa %xmm0, (LR_SIZE)(%rsp)
> - vmovdqa %xmm1, (LR_SIZE + XMM_SIZE)(%rsp)
> + vmovdqa %xmm0, (LR_SIZE + XMM_SIZE*0)(%rsp)
> + vmovdqa %xmm1, (LR_SIZE + XMM_SIZE*1)(%rsp)
> vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
> vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
> vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
> @@ -265,84 +269,84 @@ _dl_runtime_profile:
> movq LR_R8_OFFSET(%rsp), %r8
> movq LR_R9_OFFSET(%rsp), %r9
>
> - movaps (LR_XMM_OFFSET)(%rsp), %xmm0
> - movaps (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
> - movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
> - movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
> - movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
> - movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
> - movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
> - movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*0)(%rsp), %xmm0
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*1)(%rsp), %xmm1
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
> + VMOVA (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
>
> # ifdef RESTORE_AVX
> /* Check if any xmm0-xmm7 registers are changed by audit
> module. */
> - vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
> + vpcmpeqb (LR_SIZE)(%rsp), %xmm0, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
> vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
> vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
> vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
> vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
> vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
> vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
> jmp 1f
> 2: VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
> vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
>
> -1: vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
> +1: vpcmpeqb (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
> vpmovmskb %xmm8, %esi
> - cmpl $0xffff, %esi
> + incw %si
> je 2f
> vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
> jmp 1f
> @@ -352,8 +356,8 @@ _dl_runtime_profile:
> 1:
> # endif
>
> - mov 16(%rbx), %R10_LP # Anything in framesize?
> - test %R10_LP, %R10_LP
> + mov 16(%rbx), %RCX_LP # Anything in framesize?
> + test %RCX_LP, %RCX_LP
> jns 3f
>
> /* There's nothing in the frame size, so there
> @@ -385,14 +389,11 @@ _dl_runtime_profile:
> returned from _dl_profile_fixup */
>
> lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
> - add $8, %R10_LP
> - and $-16, %R10_LP
> - mov %R10_LP, %RCX_LP
> - sub %R10_LP, %RSP_LP
> + add $8, %RCX_LP
> + and $-16, %RCX_LP
> + sub %RCX_LP, %RSP_LP
> mov %RSP_LP, %RDI_LP
> - shr $3, %RCX_LP
> - rep
> - movsq
> + rep movsb
>
> movq 24(%rdi), %rcx # Get back register content.
> movq 32(%rdi), %rsi
> @@ -428,8 +429,8 @@ _dl_runtime_profile:
> movq %rax, LRV_RAX_OFFSET(%rcx)
> movq %rdx, LRV_RDX_OFFSET(%rcx)
>
> - movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
> - movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
> + VMOVA %xmm0, LRV_XMM0_OFFSET(%rcx)
> + VMOVA %xmm1, LRV_XMM1_OFFSET(%rcx)
>
> # ifdef RESTORE_AVX
> /* This is to support AVX audit modules. */
> @@ -438,8 +439,8 @@ _dl_runtime_profile:
>
> /* Save xmm0/xmm1 registers to detect if they are changed
> by audit module. */
> - vmovdqa %xmm0, (LRV_SIZE)(%rcx)
> - vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
> + vmovdqa %xmm0, (LRV_SIZE + XMM_SIZE*0)(%rcx)
> + vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE*1)(%rcx)
> # endif
>
> fstpt LRV_ST0_OFFSET(%rcx)
> @@ -454,20 +455,20 @@ _dl_runtime_profile:
> movq LRV_RAX_OFFSET(%rsp), %rax
> movq LRV_RDX_OFFSET(%rsp), %rdx
>
> - movaps LRV_XMM0_OFFSET(%rsp), %xmm0
> - movaps LRV_XMM1_OFFSET(%rsp), %xmm1
> + VMOVA LRV_XMM0_OFFSET(%rsp), %xmm0
> + VMOVA LRV_XMM1_OFFSET(%rsp), %xmm1
>
> # ifdef RESTORE_AVX
> /* Check if xmm0/xmm1 registers are changed by audit module. */
> - vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
> + vpcmpeqb (LRV_SIZE)(%rsp), %xmm0, %xmm2
> vpmovmskb %xmm2, %esi
> - cmpl $0xffff, %esi
> + incw %si
> jne 1f
> VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
>
> -1: vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
> +1: vpcmpeqb (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
> vpmovmskb %xmm2, %esi
> - cmpl $0xffff, %esi
> + incw %si
> jne 1f
> VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
>
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
prev parent reply other threads:[~2022-06-28 18:16 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-06-28 15:26 Noah Goldstein
2022-06-28 15:26 ` [PATCH v1] x86: Add support for building strstr with explicit ISA level Noah Goldstein
2022-06-28 18:20 ` H.J. Lu
2022-06-28 18:24 ` Noah Goldstein
2022-06-28 18:34 ` H.J. Lu
2022-06-28 18:38 ` Noah Goldstein
2022-06-28 15:26 ` [PATCH v1] x86: Add support for building {w}memcmp{eq} " Noah Goldstein
2022-06-29 18:52 ` H.J. Lu
2022-06-29 19:27 ` Noah Goldstein
2022-06-29 19:41 ` H.J. Lu
2022-06-29 19:44 ` Noah Goldstein
2022-06-29 19:48 ` H.J. Lu
2022-06-29 22:09 ` Noah Goldstein
2022-06-29 22:09 ` [PATCH v2] " Noah Goldstein
2022-06-29 22:49 ` H.J. Lu
2022-06-29 23:11 ` Noah Goldstein
2022-06-29 23:11 ` [PATCH v3] " Noah Goldstein
2022-07-01 22:56 ` H.J. Lu
2022-06-28 18:15 ` H.J. Lu [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAMe9rOqLTpyjM4GCePftsQuhe-qWHBDgUb4mpFxOS2SPCa5ogQ@mail.gmail.com \
--to=hjl.tools@gmail.com \
--cc=carlos@systemhalted.org \
--cc=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).