From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: libc-alpha@sourceware.org, fweimer@redhat.com
Subject: Re: [PATCH v3] x86-64: Update _dl_tlsdesc_dynamic to preserve vector registers
Date: Mon, 12 Feb 2024 18:05:36 +0000 [thread overview]
Message-ID: <CAFUsyf+pH1eFwsE2o04UV+UF4pQAFCpJkn92neanPgtrnupHXw@mail.gmail.com> (raw)
In-Reply-To: <20240212132657.3478716-1-hjl.tools@gmail.com>
On Mon, Feb 12, 2024 at 1:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Changes in v3:
>
> 1. Don't add GLRO(dl_x86_64_tlsdesc_dynamic) to libc.a.
>
> Changes in v2:
>
> 1. Add GLRO(dl_x86_64_runtime_resolve) to optimize
> elf_machine_runtime_setup.
>
> ---
> Compiler generates the following instruction sequence for GNU2 dynamic
> TLS access:
>
> leaq tls_var@TLSDESC(%rip), %rax
> call *tls_var@TLSCALL(%rax)
>
> CALL instruction may be transparent to compiler which assumes all
> registers, except for RAX, are unchanged after CALL. At run-time,
> _dl_tlsdesc_dynamic is called, which calls __tls_get_addr on the
> slow path. __tls_get_addr is a normal function which doesn't
> preserve any caller-saved registers. _dl_tlsdesc_dynamic saves and
> restores integer caller-saved registers, but doesn't preserve any
> vector registers which are caller-saved. Add _dl_tlsdesc_dynamic
> IFUNC functions for FXSAVE, XSAVE and XSAVEC to save and restore
> all vector registers. This fixes BZ #31372.
>
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_64_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
> elf/Makefile | 13 ++
> elf/tst-gnu2-tls2.c | 89 ++++++++++++++
> elf/tst-gnu2-tls2.h | 26 ++++
> elf/tst-gnu2-tls2mod0.c | 28 +++++
> elf/tst-gnu2-tls2mod1.c | 28 +++++
> elf/tst-gnu2-tls2mod2.c | 28 +++++
> sysdeps/x86/cpu-features.c | 37 ++++++
> sysdeps/x86_64/dl-machine.h | 19 +--
> sysdeps/x86_64/dl-procinfo.c | 32 +++++
> sysdeps/x86_64/dl-tlsdesc-dynamic.h | 170 +++++++++++++++++++++++++++
> sysdeps/x86_64/dl-tlsdesc.S | 108 +++++------------
> sysdeps/x86_64/dl-trampoline-save.h | 36 ++++++
> sysdeps/x86_64/dl-trampoline-state.h | 51 ++++++++
> sysdeps/x86_64/dl-trampoline.S | 20 +---
> sysdeps/x86_64/dl-trampoline.h | 34 +-----
> 15 files changed, 576 insertions(+), 143 deletions(-)
> create mode 100644 elf/tst-gnu2-tls2.c
> create mode 100644 elf/tst-gnu2-tls2.h
> create mode 100644 elf/tst-gnu2-tls2mod0.c
> create mode 100644 elf/tst-gnu2-tls2mod1.c
> create mode 100644 elf/tst-gnu2-tls2mod2.c
> create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
> create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
> create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..ff15ec16dd 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -424,6 +424,7 @@ tests += \
> tst-glibc-hwcaps-prepend \
> tst-global1 \
> tst-global2 \
> + tst-gnu2-tls2 \
> tst-initfinilazyfail \
> tst-initorder \
> tst-initorder2 \
> @@ -846,6 +847,9 @@ modules-names += \
> tst-filterobj-flt \
> tst-finilazyfailmod \
> tst-globalmod2 \
> + tst-gnu2-tls2mod0 \
> + tst-gnu2-tls2mod1 \
> + tst-gnu2-tls2mod2 \
> tst-initlazyfailmod \
> tst-initorder2a \
> tst-initorder2b \
> @@ -3044,8 +3048,17 @@ $(objpfx)tst-tlsgap.out: \
> $(objpfx)tst-tlsgap-mod0.so \
> $(objpfx)tst-tlsgap-mod1.so \
> $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
> +$(objpfx)tst-gnu2-tls2.out: \
> + $(objpfx)tst-gnu2-tls2mod0.so \
> + $(objpfx)tst-gnu2-tls2mod1.so \
> + $(objpfx)tst-gnu2-tls2mod2.so
> ifeq (yes,$(have-mtls-dialect-gnu2))
> CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
> CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
> CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
> endif
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..36547efb6d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,89 @@
> +/* Test TLSDESC relocation.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include "tst-gnu2-tls2.h"
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> + mod[i] = xdlopen (modname[i], RTLD_LAZY);
> + printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> + xdlclose (mod[i]);
> + mod[i] = NULL;
> + printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> + struct tls var = { -1, -1, -1, -1 };
> + struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> + struct tls *p = f (&var);
> + printf ("access %s: %s() = %p\n", modname[i], sym, p);
> + TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> + ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> + /* The DTV generation is at the last dlopen of mod0 and the
> + entry for mod1 is NULL. */
> +
> + open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS. */
> +
> + /* Force the slow path in GNU2 TLS descriptor call. */
> + access_mod (1, "apply_tls");
> +
> + return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> + open_mod (0);
> + open_mod (1);
> + open_mod (2);
> + close_mod (0);
> + close_mod (1); /* Create modid gap at mod1. */
> + open_mod (0); /* Reuse modid of mod0, bump generation count. */
> +
> + /* Create a thread where DTV of mod1 is NULL. */
> + pthread_t t = xpthread_create (NULL, start, NULL);
> + xpthread_join (t);
> + return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> + int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> + tls_var0 = *p;
> + return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> + tls_var1[1] = *p;
> + return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> + tls_var2 = *p;
> + return &tls_var2;
> +}
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..aafdaee3e3 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -47,6 +47,16 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
> : plt_rewrite_jmp);
> }
> }
> +
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
> #endif
>
> #ifdef __LP64__
> @@ -1130,6 +1140,33 @@ no_cpuid:
> TUNABLE_CALLBACK (set_x86_shstk));
> #endif
>
> +# ifdef __x86_64__
> + if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> + {
> + GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +# ifdef SHARED
> + GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +# endif
> + }
> + else
> + {
> + GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +# ifdef SHARED
> + GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +# endif
> + }
> + }
> + else
> + {
> + GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> + GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> + }
> +#endif
> +
> #ifdef SHARED
> # ifdef __x86_64__
> TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..74b977fd3c 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> int lazy, int profile)
> {
> Elf64_Addr *got;
> - extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> - extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
> extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
> extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
> extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> /* Identify this shared object. */
> *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> - const struct cpu_features* cpu_features = __get_cpu_features ();
> -
> #ifdef SHARED
> /* The got[2] entry contains the address of a function which gets
> called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> end in this function. */
> if (__glibc_unlikely (profile))
> {
> + const struct cpu_features* cpu_features = __get_cpu_features ();
> if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
> *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
> else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
> /* This function will get called to fix up the GOT entry
> indicated by the offset on the stack, and then jump to
> the resolved address. */
> - if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> - || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> - *(ElfW(Addr) *) (got + 2)
> - = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> - ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> - : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> - else
> - *(ElfW(Addr) *) (got + 2)
> - = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> + *(ElfW(Addr) *) (got + 2)
> + = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
> }
> }
>
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
> {
> td->arg = _dl_make_tlsdesc_dynamic
> (sym_map, sym->st_value + reloc->r_addend);
> - td->entry = _dl_tlsdesc_dynamic;
> + td->entry = GLRO(dl_x86_64_tlsdesc_dynamic);
> }
> else
> # endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..8f317a502c 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,37 @@
>
> #include <sysdeps/x86/dl-procinfo.c>
>
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> + ._dl_x86_64_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> + ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
> #undef PROCINFO_DECL
> #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..b708d0d9e4
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,170 @@
> +/* Thread-local storage handling in the ELF dynamic linker. x86_64 version.
> + Copyright (C) 2004-2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef SECTION
> +# define SECTION(p) p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> + .section SECTION(.text),"ax",@progbits
> +
> + .hidden _dl_tlsdesc_dynamic
> + .global _dl_tlsdesc_dynamic
> + .type _dl_tlsdesc_dynamic,@function
> +
> + /* %rax points to the TLS descriptor, such that 0(%rax) points to
> + _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> + tlsdesc_dynamic_arg object. It must return in %rax the offset
> + between the thread pointer and the object denoted by the
> + argument, without clobbering any registers.
> +
> + The assembly code that follows is a rendition of the following
> + C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> + struct tlsdesc_dynamic_arg *td = tdp->arg;
> + dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> + if (__builtin_expect (td->gen_count <= dtv[0].counter
> + && (dtv[td->tlsinfo.ti_module].pointer.val
> + != TLS_DTV_UNALLOCATED),
> + 1))
> + return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> + - __thread_pointer;
> +
> + return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> + cfi_startproc
> + .align 16
> +_dl_tlsdesc_dynamic:
> + _CET_ENDBR
> + /* Preserve call-clobbered registers that we modify.
> + We need two scratch regs anyway. */
> + movq %rsi, -16(%rsp)
> + mov %fs:DTV_OFFSET, %RSI_LP
> + movq %rdi, -8(%rsp)
> + movq TLSDESC_ARG(%rax), %rdi
> + movq (%rsi), %rax
> + cmpq %rax, TLSDESC_GEN_COUNT(%rdi)
> + ja 2f
> + movq TLSDESC_MODID(%rdi), %rax
> + salq $4, %rax
> + movq (%rax,%rsi), %rax
> + cmpq $-1, %rax
> + je 2f
> + addq TLSDESC_MODOFF(%rdi), %rax
> +1:
> + movq -16(%rsp), %rsi
> + sub %fs:0, %RAX_LP
> + movq -8(%rsp), %rdi
> + ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> + movq %rbx, -24(%rsp)
> + mov %RSP_LP, %RBX_LP
> + cfi_def_cfa_register(%rbx)
> + and $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> + # STATE_SAVE_OFFSET has space for 8 integer registers. But we
> + # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> + # RBX above.
> + sub $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> + sub $REGISTER_SAVE_AREA, %RSP_LP
> + cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> + # Allocate stack space of the required size to save the state.
> +# if IS_IN (rtld)
> + sub _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# else
> + sub _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# endif
> +#endif
> + /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> + r10 and r11. */
> + movq %rcx, REGISTER_SAVE_RCX(%rsp)
> + movq %rdx, REGISTER_SAVE_RDX(%rsp)
> + movq %r8, REGISTER_SAVE_R8(%rsp)
> + movq %r9, REGISTER_SAVE_R9(%rsp)
> + movq %r10, REGISTER_SAVE_R10(%rsp)
> + movq %r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> + fxsave STATE_SAVE_OFFSET(%rsp)
> +#else
> + movl $STATE_SAVE_MASK, %eax
> + xorl %edx, %edx
Do we not need to save/restore state components [32:62]? EDX, the upper
half of the XSAVE mask in EDX:EAX, is zeroed here.
> + # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> + movq %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> + movq %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
What is the purpose of zeroing the XSAVE header here? Isn't this memory
going to be immediately overwritten by the xsave/xsavec below?
> +# ifdef USE_XSAVE
> + xsave STATE_SAVE_OFFSET(%rsp)
> +# else
> + xsavec STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> + /* %rdi already points to the tlsinfo data structure. */
> + call HIDDEN_JUMPTARGET (__tls_get_addr)
> + # Get register content back.
> +#ifdef USE_FXSAVE
> + fxrstor STATE_SAVE_OFFSET(%rsp)
> +#else
> + /* Save and restore __tls_get_addr return value stored in RAX. */
> + mov %RAX_LP, %RCX_LP
> + movl $STATE_SAVE_MASK, %eax
> + xorl %edx, %edx
> + xrstor STATE_SAVE_OFFSET(%rsp)
> + mov %RCX_LP, %RAX_LP
> +#endif
> + movq REGISTER_SAVE_R11(%rsp), %r11
> + movq REGISTER_SAVE_R10(%rsp), %r10
> + movq REGISTER_SAVE_R9(%rsp), %r9
> + movq REGISTER_SAVE_R8(%rsp), %r8
> + movq REGISTER_SAVE_RDX(%rsp), %rdx
> + movq REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> + mov %RBX_LP, %RSP_LP
> + cfi_def_cfa_register(%rsp)
> + movq -24(%rsp), %rbx
> + cfi_restore(%rbx)
> +#else
> + add $REGISTER_SAVE_AREA, %RSP_LP
> + cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> + jmp 1b
> + cfi_endproc
> + .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>
> #include <sysdep.h>
> #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
> #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> + passing when calling _dl_tlsdesc_dynamic. */
> +#define REGISTER_SAVE_RCX 0
> +#define REGISTER_SAVE_RDX (REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8 (REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9 (REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10 (REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11 (REGISTER_SAVE_R10 + 8)
>
> .text
>
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
> .size _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
> #ifdef SHARED
> - .hidden _dl_tlsdesc_dynamic
> - .global _dl_tlsdesc_dynamic
> - .type _dl_tlsdesc_dynamic,@function
> -
> - /* %rax points to the TLS descriptor, such that 0(%rax) points to
> - _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> - tlsdesc_dynamic_arg object. It must return in %rax the offset
> - between the thread pointer and the object denoted by the
> - argument, without clobbering any registers.
> -
> - The assembly code that follows is a rendition of the following
> - C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> - struct tlsdesc_dynamic_arg *td = tdp->arg;
> - dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> - if (__builtin_expect (td->gen_count <= dtv[0].counter
> - && (dtv[td->tlsinfo.ti_module].pointer.val
> - != TLS_DTV_UNALLOCATED),
> - 1))
> - return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> - - __thread_pointer;
> -
> - return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> - cfi_startproc
> - .align 16
> -_dl_tlsdesc_dynamic:
> - _CET_ENDBR
> - /* Preserve call-clobbered registers that we modify.
> - We need two scratch regs anyway. */
> - movq %rsi, -16(%rsp)
> - mov %fs:DTV_OFFSET, %RSI_LP
> - movq %rdi, -8(%rsp)
> - movq TLSDESC_ARG(%rax), %rdi
> - movq (%rsi), %rax
> - cmpq %rax, TLSDESC_GEN_COUNT(%rdi)
> - ja .Lslow
> - movq TLSDESC_MODID(%rdi), %rax
> - salq $4, %rax
> - movq (%rax,%rsi), %rax
> - cmpq $-1, %rax
> - je .Lslow
> - addq TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> - movq -16(%rsp), %rsi
> - sub %fs:0, %RAX_LP
> - movq -8(%rsp), %rdi
> - ret
> -.Lslow:
> - /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> - r10 and r11. Also, align the stack, that's off by 8 bytes. */
> - subq $72, %rsp
> - cfi_adjust_cfa_offset (72)
> - movq %rdx, 8(%rsp)
> - movq %rcx, 16(%rsp)
> - movq %r8, 24(%rsp)
> - movq %r9, 32(%rsp)
> - movq %r10, 40(%rsp)
> - movq %r11, 48(%rsp)
> - /* %rdi already points to the tlsinfo data structure. */
> - call HIDDEN_JUMPTARGET (__tls_get_addr)
> - movq 8(%rsp), %rdx
> - movq 16(%rsp), %rcx
> - movq 24(%rsp), %r8
> - movq 32(%rsp), %r9
> - movq 40(%rsp), %r10
> - movq 48(%rsp), %r11
> - addq $72, %rsp
> - cfi_adjust_cfa_offset (-72)
> - jmp .Lret
> - cfi_endproc
> - .size _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT 16
> +# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT 64
> +# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT 64
> +# define _dl_tlsdesc_dynamic _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
> #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..ebfbfe5c27
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,36 @@
> +/* x86-64 PLT trampoline register save macros.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> + https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> + __tls_get_addr may be called with 8-byte stack alignment. Although
> + this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> + that stack will be always aligned at 16 bytes. We use unaligned
> + 16-byte move to load and store SSE registers, which has no penalty
> + on modern processors if stack is 16-byte aligned. */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> + stack to 16 bytes before calling _dl_fixup. */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> + (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> + || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX. */
> +# define LOCAL_STORAGE_AREA 8
> +# define BASE rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers. */
> +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
> +# if (REGISTER_SAVE_AREA % 16) != 0
> +# error REGISTER_SAVE_AREA must be multiple of 16
> +# endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +# error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers. */
> +# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address: All saved
> + registers. */
> +# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
> +# define BASE rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +# error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
> #include <features-offsets.h>
> #include <link-defines.h>
> #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> - __tls_get_addr may be called with 8-byte stack alignment. Although
> - this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> - that stack will be always aligned at 16 bytes. We use unaligned
> - 16-byte move to load and store SSE registers, which has no penalty
> - on modern processors if stack is 16-byte aligned. */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> - stack to 16 bytes before calling _dl_fixup. */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> - (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> - || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>
> /* Area on stack to save and restore registers used for parameter
> passing when calling _dl_fixup. */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
> # undef LOCAL_STORAGE_AREA
> # undef BASE
>
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -# error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX. */
> -# define LOCAL_STORAGE_AREA 8
> -# define BASE rbx
> -# ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers. */
> -# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET)
> -# if (REGISTER_SAVE_AREA % 16) != 0
> -# error REGISTER_SAVE_AREA must be multiple of 16
> -# endif
> -# endif
> -# else
> -# ifndef USE_FXSAVE
> -# error USE_FXSAVE must be defined
> -# endif
> -/* Use fxsave to save XMM registers. */
> -# define REGISTER_SAVE_AREA (512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address: All saved
> - registers. */
> -# define LOCAL_STORAGE_AREA REGISTER_SAVE_AREA
> -# define BASE rsp
> -# if (REGISTER_SAVE_AREA % 16) != 8
> -# error REGISTER_SAVE_AREA must be odd multiple of 8
> -# endif
> -# endif
> +# include "dl-trampoline-state.h"
>
> .globl _dl_runtime_resolve
> .hidden _dl_runtime_resolve
> --
> 2.43.0
>
next prev parent reply other threads:[~2024-02-12 18:05 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-02-12 13:26 H.J. Lu
2024-02-12 18:05 ` Noah Goldstein [this message]
2024-02-12 18:55 ` H.J. Lu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFUsyf+pH1eFwsE2o04UV+UF4pQAFCpJkn92neanPgtrnupHXw@mail.gmail.com \
--to=goldstein.w.n@gmail.com \
--cc=fweimer@redhat.com \
--cc=hjl.tools@gmail.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).