From: "H.J. Lu" <hjl.tools@gmail.com>
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
	"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v1] x86-64: Small improvements to dl-trampoline.S
Date: Tue, 28 Jun 2022 11:15:24 -0700
Message-ID: <CAMe9rOqLTpyjM4GCePftsQuhe-qWHBDgUb4mpFxOS2SPCa5ogQ@mail.gmail.com>
In-Reply-To: <20220628152628.17802-1-goldstein.w.n@gmail.com>

On Tue, Jun 28, 2022 at 8:26 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1.  Remove sse2 instructions when using the avx512 or avx version.
>
> 2.  Fix up some formatting nits in how the address offsets were aligned.
>
> 3.  Use more space-efficient instructions in the conditional AVX
>     restore.
>         - vpcmpeqq          -> vpcmpeqb
>         - cmp imm32, r; jz  -> inc r; jz
>
> 4.  Use `rep movsb` instead of `rep movsq`. The former is guaranteed to
>     be fast when the ERMS flag is set, the latter is not. The latter
>     also wastes instructions on count setup.
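
The `incw` trick in 3 is worth spelling out: `vpmovmskb` on a 16-byte
compare result leaves a 16-bit mask in %esi that is 0xffff exactly when
every byte matched, so incrementing the low word wraps it to zero and
sets ZF.  A minimal sketch of the pattern (the memory operand is a
placeholder):

        vpcmpeqb (%rsp), %xmm0, %xmm8   # byte = 0xff where equal
        vpmovmskb %xmm8, %esi           # all equal -> %esi = 0xffff
        incw %si                        # 0xffff + 1 wraps to 0, sets ZF
        je 2f                           # unchanged by the audit module

`incw %si` encodes in 3 bytes where `cmpl $0xffff, %esi` takes 6, if I
count right, so each of the ten checks below shrinks a little.
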
> ---
>  sysdeps/x86_64/dl-trampoline.S |   4 ++
>  sysdeps/x86_64/dl-trampoline.h | 113 +++++++++++++++++----------------
>  2 files changed, 61 insertions(+), 56 deletions(-)
>
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index f669805ac5..580d2b6499 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -57,22 +57,26 @@
>  #define VMOVA                  vmovdqa64
>  #define VEC(i)                 zmm##i
>  #define _dl_runtime_profile    _dl_runtime_profile_avx512
> +# define SECTION(p)            p##.evex512
>  #include "dl-trampoline.h"
>  #undef _dl_runtime_profile
>  #undef VEC
>  #undef VMOVA
>  #undef VEC_SIZE
> +#undef SECTION
>
>  #if MINIMUM_X86_ISA_LEVEL <= AVX_X86_ISA_LEVEL
>  # define VEC_SIZE              32
>  # define VMOVA                 vmovdqa
>  # define VEC(i)                        ymm##i
> +# define SECTION(p)            p##.avx
>  # define _dl_runtime_profile   _dl_runtime_profile_avx
>  # include "dl-trampoline.h"
>  # undef _dl_runtime_profile
>  # undef VEC
>  # undef VMOVA
>  # undef VEC_SIZE
> +# undef SECTION
>  #endif
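
If I read the ISA-level gating right, a build with x86-64-v3 as the
baseline (MINIMUM_X86_ISA_LEVEL equal to AVX_X86_ISA_LEVEL) ends up
with just the two variants above, along the lines of:

        _dl_runtime_profile_avx512      # placed in .text.evex512
        _dl_runtime_profile_avx         # placed in .text.avx

and the plain sse variant guarded below by
MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL is compiled out entirely.
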
>
>  #if MINIMUM_X86_ISA_LEVEL < AVX_X86_ISA_LEVEL
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index 03bd91b3e9..3c419047ec 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -16,7 +16,11 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -       .text
> +#ifndef SECTION
> +# define SECTION(p)    p
> +#endif
> +
> +       .section SECTION(.text),"ax",@progbits
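
With the token pasting, SECTION(.text) expands to plain `.text` when
the including file defines nothing, and to a per-variant section name
otherwise, i.e. (assumed expansions):

        .section .text,"ax",@progbits           # default build
        .section .text.avx,"ax",@progbits       # SECTION(p) is p##.avx
        .section .text.evex512,"ax",@progbits   # SECTION(p) is p##.evex512
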
>  #ifdef _dl_runtime_resolve
>
>  # undef REGISTER_SAVE_AREA
> @@ -219,19 +223,19 @@ _dl_runtime_profile:
>         /* We always store the XMM registers even if AVX is available.
>            This is to provide backward binary compatibility for existing
>            audit modules.  */
> -       movaps %xmm0,              (LR_XMM_OFFSET)(%rsp)
> -       movaps %xmm1, (LR_XMM_OFFSET +   XMM_SIZE)(%rsp)
> -       movaps %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
> -       movaps %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
> -       movaps %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
> -       movaps %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
> -       movaps %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
> -       movaps %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
> +       VMOVA %xmm0, (LR_XMM_OFFSET + XMM_SIZE*0)(%rsp)
> +       VMOVA %xmm1, (LR_XMM_OFFSET + XMM_SIZE*1)(%rsp)
> +       VMOVA %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
> +       VMOVA %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
> +       VMOVA %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
> +       VMOVA %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
> +       VMOVA %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
> +       VMOVA %xmm7, (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp)
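
This also covers 1: VMOVA is VEX/EVEX-encoded in the avx and avx512
builds (vmovdqa and vmovdqa64 per the defines in dl-trampoline.S), so
those variants no longer mix the legacy-SSE-encoded movaps into
otherwise VEX/EVEX code, which as I understand it is what can trigger
SSE/AVX transition penalties; the sse variant should be unaffected,
assuming its VMOVA stays defined as movaps.
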
>
>  # ifdef RESTORE_AVX
>         /* This is to support AVX audit modules.  */
> -       VMOVA %VEC(0),                (LR_VECTOR_OFFSET)(%rsp)
> -       VMOVA %VEC(1), (LR_VECTOR_OFFSET +   VECTOR_SIZE)(%rsp)
> +       VMOVA %VEC(0), (LR_VECTOR_OFFSET + VECTOR_SIZE*0)(%rsp)
> +       VMOVA %VEC(1), (LR_VECTOR_OFFSET + VECTOR_SIZE*1)(%rsp)
>         VMOVA %VEC(2), (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
>         VMOVA %VEC(3), (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
>         VMOVA %VEC(4), (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
> @@ -241,8 +245,8 @@ _dl_runtime_profile:
>
>         /* Save xmm0-xmm7 registers to detect if any of them are
>            changed by audit module.  */
> -       vmovdqa %xmm0,              (LR_SIZE)(%rsp)
> -       vmovdqa %xmm1, (LR_SIZE +   XMM_SIZE)(%rsp)
> +       vmovdqa %xmm0, (LR_SIZE + XMM_SIZE*0)(%rsp)
> +       vmovdqa %xmm1, (LR_SIZE + XMM_SIZE*1)(%rsp)
>         vmovdqa %xmm2, (LR_SIZE + XMM_SIZE*2)(%rsp)
>         vmovdqa %xmm3, (LR_SIZE + XMM_SIZE*3)(%rsp)
>         vmovdqa %xmm4, (LR_SIZE + XMM_SIZE*4)(%rsp)
> @@ -265,84 +269,84 @@ _dl_runtime_profile:
>         movq  LR_R8_OFFSET(%rsp), %r8
>         movq  LR_R9_OFFSET(%rsp), %r9
>
> -       movaps              (LR_XMM_OFFSET)(%rsp), %xmm0
> -       movaps   (LR_XMM_OFFSET + XMM_SIZE)(%rsp), %xmm1
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
> -       movaps (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*0)(%rsp), %xmm0
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*1)(%rsp), %xmm1
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp), %xmm2
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp), %xmm3
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp), %xmm4
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp), %xmm5
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp), %xmm6
> +       VMOVA (LR_XMM_OFFSET + XMM_SIZE*7)(%rsp), %xmm7
>
>  # ifdef RESTORE_AVX
>         /* Check if any xmm0-xmm7 registers are changed by audit
>            module.  */
> -       vpcmpeqq (LR_SIZE)(%rsp), %xmm0, %xmm8
> +       vpcmpeqb (LR_SIZE)(%rsp), %xmm0, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm0, (LR_VECTOR_OFFSET)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET)(%rsp), %VEC(0)
>         vmovdqa %xmm0, (LR_XMM_OFFSET)(%rsp)
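
As I read the control flow for each register here: if the audit module
left xmm0 untouched, the wrapped mask sets ZF and 2f reloads the
caller's full-width %VEC(0) from its save slot; if the audit module
changed it, we skip that reload, keep the new 128-bit value (already
zero-extended by the load above), and write it into the vector save
slot so the saved state stays consistent.
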
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm1, (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE)(%rsp), %VEC(1)
>         vmovdqa %xmm1, (LR_XMM_OFFSET + XMM_SIZE)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*2)(%rsp), %xmm2, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm2, (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*2)(%rsp), %VEC(2)
>         vmovdqa %xmm2, (LR_XMM_OFFSET + XMM_SIZE*2)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*3)(%rsp), %xmm3, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm3, (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*3)(%rsp), %VEC(3)
>         vmovdqa %xmm3, (LR_XMM_OFFSET + XMM_SIZE*3)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*4)(%rsp), %xmm4, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm4, (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*4)(%rsp), %VEC(4)
>         vmovdqa %xmm4, (LR_XMM_OFFSET + XMM_SIZE*4)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*5)(%rsp), %xmm5, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm5, (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*5)(%rsp), %VEC(5)
>         vmovdqa %xmm5, (LR_XMM_OFFSET + XMM_SIZE*5)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*6)(%rsp), %xmm6, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm6, (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp)
>         jmp 1f
>  2:     VMOVA (LR_VECTOR_OFFSET + VECTOR_SIZE*6)(%rsp), %VEC(6)
>         vmovdqa %xmm6, (LR_XMM_OFFSET + XMM_SIZE*6)(%rsp)
>
> -1:     vpcmpeqq (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
> +1:     vpcmpeqb (LR_SIZE + XMM_SIZE*7)(%rsp), %xmm7, %xmm8
>         vpmovmskb %xmm8, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         je 2f
>         vmovdqa %xmm7, (LR_VECTOR_OFFSET + VECTOR_SIZE*7)(%rsp)
>         jmp 1f
> @@ -352,8 +356,8 @@ _dl_runtime_profile:
>  1:
>  # endif
>
> -       mov  16(%rbx), %R10_LP  # Anything in framesize?
> -       test %R10_LP, %R10_LP
> +       mov  16(%rbx), %RCX_LP  # Anything in framesize?
> +       test %RCX_LP, %RCX_LP
>         jns 3f
>
>         /* There's nothing in the frame size, so there
> @@ -385,14 +389,11 @@ _dl_runtime_profile:
>            returned from _dl_profile_fixup */
>
>         lea LR_RSP_OFFSET(%rbx), %RSI_LP # stack
> -       add $8, %R10_LP
> -       and $-16, %R10_LP
> -       mov %R10_LP, %RCX_LP
> -       sub %R10_LP, %RSP_LP
> +       add $8, %RCX_LP
> +       and $-16, %RCX_LP
> +       sub %RCX_LP, %RSP_LP
>         mov %RSP_LP, %RDI_LP
> -       shr $3, %RCX_LP
> -       rep
> -       movsq
> +       rep movsb
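
On 4: ERMS (CPUID.(EAX=7,ECX=0):EBX bit 9, if I have the leaf right) is
what makes `rep movsb` reliably fast, and taking the byte count straight
from the framesize already in %rcx drops the count conversion the qword
copy needed:

        # before: byte count had to be converted to qwords
        mov %R10_LP, %RCX_LP
        shr $3, %RCX_LP
        rep movsq
        # after: framesize is kept in %rcx as a byte count
        rep movsb

which is also why the framesize now loads into %rcx in the earlier
hunk.
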
>
>         movq 24(%rdi), %rcx     # Get back register content.
>         movq 32(%rdi), %rsi
> @@ -428,8 +429,8 @@ _dl_runtime_profile:
>         movq %rax, LRV_RAX_OFFSET(%rcx)
>         movq %rdx, LRV_RDX_OFFSET(%rcx)
>
> -       movaps %xmm0, LRV_XMM0_OFFSET(%rcx)
> -       movaps %xmm1, LRV_XMM1_OFFSET(%rcx)
> +       VMOVA %xmm0, LRV_XMM0_OFFSET(%rcx)
> +       VMOVA %xmm1, LRV_XMM1_OFFSET(%rcx)
>
>  # ifdef RESTORE_AVX
>         /* This is to support AVX audit modules.  */
> @@ -438,8 +439,8 @@ _dl_runtime_profile:
>
>         /* Save xmm0/xmm1 registers to detect if they are changed
>            by audit module.  */
> -       vmovdqa %xmm0,            (LRV_SIZE)(%rcx)
> -       vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE)(%rcx)
> +       vmovdqa %xmm0, (LRV_SIZE + XMM_SIZE*0)(%rcx)
> +       vmovdqa %xmm1, (LRV_SIZE + XMM_SIZE*1)(%rcx)
>  # endif
>
>         fstpt LRV_ST0_OFFSET(%rcx)
> @@ -454,20 +455,20 @@ _dl_runtime_profile:
>         movq LRV_RAX_OFFSET(%rsp), %rax
>         movq LRV_RDX_OFFSET(%rsp), %rdx
>
> -       movaps LRV_XMM0_OFFSET(%rsp), %xmm0
> -       movaps LRV_XMM1_OFFSET(%rsp), %xmm1
> +       VMOVA LRV_XMM0_OFFSET(%rsp), %xmm0
> +       VMOVA LRV_XMM1_OFFSET(%rsp), %xmm1
>
>  # ifdef RESTORE_AVX
>         /* Check if xmm0/xmm1 registers are changed by audit module.  */
> -       vpcmpeqq (LRV_SIZE)(%rsp), %xmm0, %xmm2
> +       vpcmpeqb (LRV_SIZE)(%rsp), %xmm0, %xmm2
>         vpmovmskb %xmm2, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         jne 1f
>         VMOVA LRV_VECTOR0_OFFSET(%rsp), %VEC(0)
>
> -1:     vpcmpeqq (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
> +1:     vpcmpeqb (LRV_SIZE + XMM_SIZE)(%rsp), %xmm1, %xmm2
>         vpmovmskb %xmm2, %esi
> -       cmpl $0xffff, %esi
> +       incw %si
>         jne 1f
>         VMOVA LRV_VECTOR1_OFFSET(%rsp), %VEC(1)
>
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.
