public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: libc-alpha@sourceware.org, fweimer@redhat.com
Subject: Re: [PATCH v3] x86-64: Update _dl_tlsdesc_dynamic to preserve vector registers
Date: Mon, 12 Feb 2024 18:05:36 +0000	[thread overview]
Message-ID: <CAFUsyf+pH1eFwsE2o04UV+UF4pQAFCpJkn92neanPgtrnupHXw@mail.gmail.com> (raw)
In-Reply-To: <20240212132657.3478716-1-hjl.tools@gmail.com>

On Mon, Feb 12, 2024 at 1:27 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Changes in v3:
>
> 1. Don't add GLRO(dl_x86_64_tlsdesc_dynamic) to libc.a.
>
> Changes in v2:
>
> 1.  Add GLRO(dl_x86_64_runtime_resolve) to optimize
> elf_machine_runtime_setup.
>
> ---
> The compiler generates the following instruction sequence for GNU2
> dynamic TLS access:
>
>         leaq    tls_var@TLSDESC(%rip), %rax
>         call    *tls_var@TLSCALL(%rax)
>
> The CALL instruction may be transparent to the compiler, which assumes
> that all registers, except for RAX, are unchanged after the CALL.  At
> run-time, _dl_tlsdesc_dynamic is called, which calls __tls_get_addr on
> the slow path.  __tls_get_addr is a normal function which doesn't
> preserve any caller-saved registers.  _dl_tlsdesc_dynamic saves and
> restores the integer caller-saved registers, but doesn't preserve the
> vector registers, which are also caller-saved.  Add _dl_tlsdesc_dynamic
> IFUNC functions for FXSAVE, XSAVE and XSAVEC to save and restore
> all vector registers.  This fixes BZ #31372.
>
> Add GLRO(dl_x86_64_runtime_resolve) with GLRO(dl_x86_64_tlsdesc_dynamic)
> to optimize elf_machine_runtime_setup.
> ---
>  elf/Makefile                         |  13 ++
>  elf/tst-gnu2-tls2.c                  |  89 ++++++++++++++
>  elf/tst-gnu2-tls2.h                  |  26 ++++
>  elf/tst-gnu2-tls2mod0.c              |  28 +++++
>  elf/tst-gnu2-tls2mod1.c              |  28 +++++
>  elf/tst-gnu2-tls2mod2.c              |  28 +++++
>  sysdeps/x86/cpu-features.c           |  37 ++++++
>  sysdeps/x86_64/dl-machine.h          |  19 +--
>  sysdeps/x86_64/dl-procinfo.c         |  32 +++++
>  sysdeps/x86_64/dl-tlsdesc-dynamic.h  | 170 +++++++++++++++++++++++++++
>  sysdeps/x86_64/dl-tlsdesc.S          | 108 +++++------------
>  sysdeps/x86_64/dl-trampoline-save.h  |  36 ++++++
>  sysdeps/x86_64/dl-trampoline-state.h |  51 ++++++++
>  sysdeps/x86_64/dl-trampoline.S       |  20 +---
>  sysdeps/x86_64/dl-trampoline.h       |  34 +-----
>  15 files changed, 576 insertions(+), 143 deletions(-)
>  create mode 100644 elf/tst-gnu2-tls2.c
>  create mode 100644 elf/tst-gnu2-tls2.h
>  create mode 100644 elf/tst-gnu2-tls2mod0.c
>  create mode 100644 elf/tst-gnu2-tls2mod1.c
>  create mode 100644 elf/tst-gnu2-tls2mod2.c
>  create mode 100644 sysdeps/x86_64/dl-tlsdesc-dynamic.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-save.h
>  create mode 100644 sysdeps/x86_64/dl-trampoline-state.h
>
> diff --git a/elf/Makefile b/elf/Makefile
> index 5d78b659ce..ff15ec16dd 100644
> --- a/elf/Makefile
> +++ b/elf/Makefile
> @@ -424,6 +424,7 @@ tests += \
>    tst-glibc-hwcaps-prepend \
>    tst-global1 \
>    tst-global2 \
> +  tst-gnu2-tls2 \
>    tst-initfinilazyfail \
>    tst-initorder \
>    tst-initorder2 \
> @@ -846,6 +847,9 @@ modules-names += \
>    tst-filterobj-flt \
>    tst-finilazyfailmod \
>    tst-globalmod2 \
> +  tst-gnu2-tls2mod0 \
> +  tst-gnu2-tls2mod1 \
> +  tst-gnu2-tls2mod2 \
>    tst-initlazyfailmod \
>    tst-initorder2a \
>    tst-initorder2b \
> @@ -3044,8 +3048,17 @@ $(objpfx)tst-tlsgap.out: \
>    $(objpfx)tst-tlsgap-mod0.so \
>    $(objpfx)tst-tlsgap-mod1.so \
>    $(objpfx)tst-tlsgap-mod2.so
> +
> +$(objpfx)tst-gnu2-tls2: $(shared-thread-library)
> +$(objpfx)tst-gnu2-tls2.out: \
> +  $(objpfx)tst-gnu2-tls2mod0.so \
> +  $(objpfx)tst-gnu2-tls2mod1.so \
> +  $(objpfx)tst-gnu2-tls2mod2.so
>  ifeq (yes,$(have-mtls-dialect-gnu2))
>  CFLAGS-tst-tlsgap-mod0.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod1.c += -mtls-dialect=gnu2
>  CFLAGS-tst-tlsgap-mod2.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod0.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod1.c += -mtls-dialect=gnu2
> +CFLAGS-tst-gnu2-tls2mod2.c += -mtls-dialect=gnu2
>  endif
> diff --git a/elf/tst-gnu2-tls2.c b/elf/tst-gnu2-tls2.c
> new file mode 100644
> index 0000000000..36547efb6d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.c
> @@ -0,0 +1,89 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <string.h>
> +#include <dlfcn.h>
> +#include <pthread.h>
> +#include <support/xdlfcn.h>
> +#include <support/xthread.h>
> +#include <support/check.h>
> +#include "tst-gnu2-tls2.h"
> +
> +static void *mod[3];
> +#define MOD(i) "tst-gnu2-tls2mod" #i ".so"
> +static const char *modname[3] = { MOD(0), MOD(1), MOD(2) };
> +#undef MOD
> +
> +static void
> +open_mod (int i)
> +{
> +  mod[i] = xdlopen (modname[i], RTLD_LAZY);
> +  printf ("open %s\n", modname[i]);
> +}
> +
> +static void
> +close_mod (int i)
> +{
> +  xdlclose (mod[i]);
> +  mod[i] = NULL;
> +  printf ("close %s\n", modname[i]);
> +}
> +
> +static void
> +access_mod (int i, const char *sym)
> +{
> +  struct tls var = { -1, -1, -1, -1 };
> +  struct tls *(*f) (struct tls *) = xdlsym (mod[i], sym);
> +  struct tls *p = f (&var);
> +  printf ("access %s: %s() = %p\n", modname[i], sym, p);
> +  TEST_VERIFY_EXIT (memcmp (p, &var, sizeof (var)) == 0);
> +  ++(p->a);
> +}
> +
> +static void *
> +start (void *arg)
> +{
> +  /* The DTV generation is at the last dlopen of mod0 and the
> +     entry for mod1 is NULL.  */
> +
> +  open_mod (1); /* Reuse modid of mod1. Uses dynamic TLS.  */
> +
> +  /* Force the slow path in GNU2 TLS descriptor call.  */
> +  access_mod (1, "apply_tls");
> +
> +  return arg;
> +}
> +
> +static int
> +do_test (void)
> +{
> +  open_mod (0);
> +  open_mod (1);
> +  open_mod (2);
> +  close_mod (0);
> +  close_mod (1); /* Create modid gap at mod1.  */
> +  open_mod (0); /* Reuse modid of mod0, bump generation count.  */
> +
> +  /* Create a thread where DTV of mod1 is NULL.  */
> +  pthread_t t = xpthread_create (NULL, start, NULL);
> +  xpthread_join (t);
> +  return 0;
> +}
> +
> +#include <support/test-driver.c>
> diff --git a/elf/tst-gnu2-tls2.h b/elf/tst-gnu2-tls2.h
> new file mode 100644
> index 0000000000..e33f4dbe27
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2.h
> @@ -0,0 +1,26 @@
> +/* Test TLSDESC relocation.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +
> +struct tls
> +{
> +  int64_t a, b, c, d;
> +};
> +
> +extern struct tls *apply_tls (struct tls *);
> diff --git a/elf/tst-gnu2-tls2mod0.c b/elf/tst-gnu2-tls2mod0.c
> new file mode 100644
> index 0000000000..67dc0d464d
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod0.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var0 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var0 = *p;
> +  return &tls_var0;
> +}
> diff --git a/elf/tst-gnu2-tls2mod1.c b/elf/tst-gnu2-tls2mod1.c
> new file mode 100644
> index 0000000000..a4ae6db24f
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod1.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var1[100] __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var1[1] = *p;
> +  return &tls_var1[1];
> +}
> diff --git a/elf/tst-gnu2-tls2mod2.c b/elf/tst-gnu2-tls2mod2.c
> new file mode 100644
> index 0000000000..2d13921717
> --- /dev/null
> +++ b/elf/tst-gnu2-tls2mod2.c
> @@ -0,0 +1,28 @@
> +/* DSO used by tst-gnu2-tls2.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "tst-gnu2-tls2.h"
> +
> +__thread struct tls tls_var2 __attribute__ ((visibility ("hidden")));
> +
> +struct tls *
> +apply_tls (struct tls *p)
> +{
> +  tls_var2 = *p;
> +  return &tls_var2;
> +}
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 25e6622a79..aafdaee3e3 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -47,6 +47,16 @@ TUNABLE_CALLBACK (set_plt_rewrite) (tunable_val_t *valp)
>                  : plt_rewrite_jmp);
>      }
>  }
> +
> +extern void _dl_tlsdesc_dynamic_fxsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsave (void) attribute_hidden;
> +extern void _dl_tlsdesc_dynamic_xsavec (void) attribute_hidden;
> +#endif
> +
> +#ifdef __x86_64__
> +extern void _dl_runtime_resolve_fxsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsave (void) attribute_hidden;
> +extern void _dl_runtime_resolve_xsavec (void) attribute_hidden;
>  #endif
>
>  #ifdef __LP64__
> @@ -1130,6 +1140,33 @@ no_cpuid:
>                TUNABLE_CALLBACK (set_x86_shstk));
>  #endif
>
> +# ifdef __x86_64__
> +  if (GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC))
> +       {
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsavec;
> +# ifdef SHARED
> +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsavec;
> +# endif
> +       }
> +      else
> +       {
> +         GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_xsave;
> +# ifdef SHARED
> +         GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_xsave;
> +# endif
> +       }
> +    }
> +  else
> +    {
> +      GLRO(dl_x86_64_runtime_resolve) = _dl_runtime_resolve_fxsave;
> +# ifdef SHARED
> +      GLRO(dl_x86_64_tlsdesc_dynamic) = _dl_tlsdesc_dynamic_fxsave;
> +# endif
> +    }
> +#endif
> +
>  #ifdef SHARED
>  # ifdef __x86_64__
>    TUNABLE_GET (plt_rewrite, tunable_val_t *,
> diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
> index 6d605d0d32..74b977fd3c 100644
> --- a/sysdeps/x86_64/dl-machine.h
> +++ b/sysdeps/x86_64/dl-machine.h
> @@ -71,9 +71,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>                            int lazy, int profile)
>  {
>    Elf64_Addr *got;
> -  extern void _dl_runtime_resolve_fxsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsave (ElfW(Word)) attribute_hidden;
> -  extern void _dl_runtime_resolve_xsavec (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_sse (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx (ElfW(Word)) attribute_hidden;
>    extern void _dl_runtime_profile_avx512 (ElfW(Word)) attribute_hidden;
> @@ -96,8 +93,6 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>        /* Identify this shared object.  */
>        *(ElfW(Addr) *) (got + 1) = (ElfW(Addr)) l;
>
> -      const struct cpu_features* cpu_features = __get_cpu_features ();
> -
>  #ifdef SHARED
>        /* The got[2] entry contains the address of a function which gets
>          called to get the address of a so far unresolved function and
> @@ -107,6 +102,7 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>          end in this function.  */
>        if (__glibc_unlikely (profile))
>         {
> +         const struct cpu_features* cpu_features = __get_cpu_features ();
>           if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512F))
>             *(ElfW(Addr) *) (got + 2) = (ElfW(Addr)) &_dl_runtime_profile_avx512;
>           else if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX))
> @@ -126,15 +122,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
>           /* This function will get called to fix up the GOT entry
>              indicated by the offset on the stack, and then jump to
>              the resolved address.  */
> -         if (MINIMUM_X86_ISA_LEVEL >= AVX_X86_ISA_LEVEL
> -             || GLRO(dl_x86_cpu_features).xsave_state_size != 0)
> -           *(ElfW(Addr) *) (got + 2)
> -             = (CPU_FEATURE_USABLE_P (cpu_features, XSAVEC)
> -                ? (ElfW(Addr)) &_dl_runtime_resolve_xsavec
> -                : (ElfW(Addr)) &_dl_runtime_resolve_xsave);
> -         else
> -           *(ElfW(Addr) *) (got + 2)
> -             = (ElfW(Addr)) &_dl_runtime_resolve_fxsave;
> +         *(ElfW(Addr) *) (got + 2)
> +           = (ElfW(Addr)) GLRO(dl_x86_64_runtime_resolve);
>         }
>      }
>
> @@ -383,7 +372,7 @@ and creates an unsatisfiable circular dependency.\n",
>                   {
>                     td->arg = _dl_make_tlsdesc_dynamic
>                       (sym_map, sym->st_value + reloc->r_addend);
> -                   td->entry = _dl_tlsdesc_dynamic;
> +                   td->entry = GLRO(dl_x86_64_tlsdesc_dynamic);
>                   }
>                 else
>  #  endif
> diff --git a/sysdeps/x86_64/dl-procinfo.c b/sysdeps/x86_64/dl-procinfo.c
> index 4d1d790fbb..8f317a502c 100644
> --- a/sysdeps/x86_64/dl-procinfo.c
> +++ b/sysdeps/x86_64/dl-procinfo.c
> @@ -41,5 +41,37 @@
>
>  #include <sysdeps/x86/dl-procinfo.c>
>
> +#if defined SHARED && !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL
> +  ._dl_x86_64_tlsdesc_dynamic
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_tlsdesc_dynamic
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# ifdef PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
> +#if !IS_IN (ldconfig)
> +# if !defined PROCINFO_DECL && defined SHARED
> +  ._dl_x86_64_runtime_resolve
> +# else
> +PROCINFO_CLASS void * _dl_x86_64_runtime_resolve
> +# endif
> +# ifndef PROCINFO_DECL
> += NULL
> +# endif
> +# if !defined SHARED || defined PROCINFO_DECL
> +;
> +# else
> +,
> +# endif
> +#endif
> +
>  #undef PROCINFO_DECL
>  #undef PROCINFO_CLASS
> diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> new file mode 100644
> index 0000000000..b708d0d9e4
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
> @@ -0,0 +1,170 @@
> +/* Thread-local storage handling in the ELF dynamic linker.  x86_64 version.
> +   Copyright (C) 2004-2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef SECTION
> +# define SECTION(p)    p
> +#endif
> +
> +#undef REGISTER_SAVE_AREA
> +#undef LOCAL_STORAGE_AREA
> +#undef BASE
> +
> +#include "dl-trampoline-state.h"
> +
> +       .section SECTION(.text),"ax",@progbits
> +
> +       .hidden _dl_tlsdesc_dynamic
> +       .global _dl_tlsdesc_dynamic
> +       .type   _dl_tlsdesc_dynamic,@function
> +
> +     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> +       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> +       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> +       between the thread pointer and the object denoted by the
> +       argument, without clobbering any registers.
> +
> +       The assembly code that follows is a rendition of the following
> +       C code, hand-optimized a little bit.
> +
> +ptrdiff_t
> +_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> +{
> +  struct tlsdesc_dynamic_arg *td = tdp->arg;
> +  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> +  if (__builtin_expect (td->gen_count <= dtv[0].counter
> +                       && (dtv[td->tlsinfo.ti_module].pointer.val
> +                           != TLS_DTV_UNALLOCATED),
> +                       1))
> +    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> +      - __thread_pointer;
> +
> +  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> +}
> +*/
> +       cfi_startproc
> +       .align 16
> +_dl_tlsdesc_dynamic:
> +       _CET_ENDBR
> +       /* Preserve call-clobbered registers that we modify.
> +          We need two scratch regs anyway.  */
> +       movq    %rsi, -16(%rsp)
> +       mov     %fs:DTV_OFFSET, %RSI_LP
> +       movq    %rdi, -8(%rsp)
> +       movq    TLSDESC_ARG(%rax), %rdi
> +       movq    (%rsi), %rax
> +       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> +       ja      2f
> +       movq    TLSDESC_MODID(%rdi), %rax
> +       salq    $4, %rax
> +       movq    (%rax,%rsi), %rax
> +       cmpq    $-1, %rax
> +       je      2f
> +       addq    TLSDESC_MODOFF(%rdi), %rax
> +1:
> +       movq    -16(%rsp), %rsi
> +       sub     %fs:0, %RAX_LP
> +       movq    -8(%rsp), %rdi
> +       ret
> +2:
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       movq    %rbx, -24(%rsp)
> +       mov     %RSP_LP, %RBX_LP
> +       cfi_def_cfa_register(%rbx)
> +       and     $-STATE_SAVE_ALIGNMENT, %RSP_LP
> +#endif
> +#ifdef REGISTER_SAVE_AREA
> +# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       # STATE_SAVE_OFFSET has space for 8 integer registers.  But we
> +       # need space for RCX, RDX, RSI, RDI, R8, R9, R10 and R11, plus
> +       # RBX above.
> +       sub     $(REGISTER_SAVE_AREA + STATE_SAVE_ALIGNMENT), %RSP_LP
> +# else
> +       sub     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(REGISTER_SAVE_AREA)
> +# endif
> +#else
> +       # Allocate stack space of the required size to save the state.
> +# if IS_IN (rtld)
> +       sub     _rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# else
> +       sub      _dl_x86_cpu_features+XSAVE_STATE_SIZE_OFFSET(%rip), %RSP_LP
> +# endif
> +#endif
> +       /* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
> +          r10 and r11.  */
> +       movq    %rcx, REGISTER_SAVE_RCX(%rsp)
> +       movq    %rdx, REGISTER_SAVE_RDX(%rsp)
> +       movq    %r8, REGISTER_SAVE_R8(%rsp)
> +       movq    %r9, REGISTER_SAVE_R9(%rsp)
> +       movq    %r10, REGISTER_SAVE_R10(%rsp)
> +       movq    %r11, REGISTER_SAVE_R11(%rsp)
> +#ifdef USE_FXSAVE
> +       fxsave  STATE_SAVE_OFFSET(%rsp)
> +#else
> +       movl    $STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
Do we not need to save/restore state components [32:62] as well?  With %edx zeroed, the EDX:EAX mask only selects components [0:31].
> +       # Clear the XSAVE Header.
> +# ifdef USE_XSAVE
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8)(%rsp)
> +# endif
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 2)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 3)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 4)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 5)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 6)(%rsp)
> +       movq    %rdx, (STATE_SAVE_OFFSET + 512 + 8 * 7)(%rsp)
What is the purpose of zeroing the XSAVE header here? Isn't this memory
going to be immediately overwritten by the xsave/xsavec that follows?
> +# ifdef USE_XSAVE
> +       xsave   STATE_SAVE_OFFSET(%rsp)
> +# else
> +       xsavec  STATE_SAVE_OFFSET(%rsp)
> +# endif
> +#endif
> +       /* %rdi already points to the tlsinfo data structure.  */
> +       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> +       # Get register content back.
> +#ifdef USE_FXSAVE
> +       fxrstor STATE_SAVE_OFFSET(%rsp)
> +#else
> +       /* Save and restore __tls_get_addr return value stored in RAX.  */
> +       mov     %RAX_LP, %RCX_LP
> +       movl    $STATE_SAVE_MASK, %eax
> +       xorl    %edx, %edx
> +       xrstor  STATE_SAVE_OFFSET(%rsp)
> +       mov     %RCX_LP, %RAX_LP
> +#endif
> +       movq    REGISTER_SAVE_R11(%rsp), %r11
> +       movq    REGISTER_SAVE_R10(%rsp), %r10
> +       movq    REGISTER_SAVE_R9(%rsp), %r9
> +       movq    REGISTER_SAVE_R8(%rsp), %r8
> +       movq    REGISTER_SAVE_RDX(%rsp), %rdx
> +       movq    REGISTER_SAVE_RCX(%rsp), %rcx
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +       mov     %RBX_LP, %RSP_LP
> +       cfi_def_cfa_register(%rsp)
> +       movq    -24(%rsp), %rbx
> +       cfi_restore(%rbx)
> +#else
> +       add     $REGISTER_SAVE_AREA, %RSP_LP
> +       cfi_adjust_cfa_offset(-REGISTER_SAVE_AREA)
> +#endif
> +       jmp     1b
> +       cfi_endproc
> +       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +
> +#undef STATE_SAVE_ALIGNMENT
> diff --git a/sysdeps/x86_64/dl-tlsdesc.S b/sysdeps/x86_64/dl-tlsdesc.S
> index f748af2ece..ea69f5223a 100644
> --- a/sysdeps/x86_64/dl-tlsdesc.S
> +++ b/sysdeps/x86_64/dl-tlsdesc.S
> @@ -18,7 +18,19 @@
>
>  #include <sysdep.h>
>  #include <tls.h>
> +#include <cpu-features-offsets.h>
> +#include <features-offsets.h>
>  #include "tlsdesc.h"
> +#include "dl-trampoline-save.h"
> +
> +/* Area on stack to save and restore registers used for parameter
> +   passing when calling _dl_tlsdesc_dynamic.  */
> +#define REGISTER_SAVE_RCX      0
> +#define REGISTER_SAVE_RDX      (REGISTER_SAVE_RCX + 8)
> +#define REGISTER_SAVE_R8       (REGISTER_SAVE_RDX + 8)
> +#define REGISTER_SAVE_R9       (REGISTER_SAVE_R8 + 8)
> +#define REGISTER_SAVE_R10      (REGISTER_SAVE_R9 + 8)
> +#define REGISTER_SAVE_R11      (REGISTER_SAVE_R10 + 8)
>
>         .text
>
> @@ -67,80 +79,24 @@ _dl_tlsdesc_undefweak:
>         .size   _dl_tlsdesc_undefweak, .-_dl_tlsdesc_undefweak
>
>  #ifdef SHARED
> -       .hidden _dl_tlsdesc_dynamic
> -       .global _dl_tlsdesc_dynamic
> -       .type   _dl_tlsdesc_dynamic,@function
> -
> -     /* %rax points to the TLS descriptor, such that 0(%rax) points to
> -       _dl_tlsdesc_dynamic itself, and 8(%rax) points to a struct
> -       tlsdesc_dynamic_arg object.  It must return in %rax the offset
> -       between the thread pointer and the object denoted by the
> -       argument, without clobbering any registers.
> -
> -       The assembly code that follows is a rendition of the following
> -       C code, hand-optimized a little bit.
> -
> -ptrdiff_t
> -_dl_tlsdesc_dynamic (register struct tlsdesc *tdp asm ("%rax"))
> -{
> -  struct tlsdesc_dynamic_arg *td = tdp->arg;
> -  dtv_t *dtv = *(dtv_t **)((char *)__thread_pointer + DTV_OFFSET);
> -  if (__builtin_expect (td->gen_count <= dtv[0].counter
> -                       && (dtv[td->tlsinfo.ti_module].pointer.val
> -                           != TLS_DTV_UNALLOCATED),
> -                       1))
> -    return dtv[td->tlsinfo.ti_module].pointer.val + td->tlsinfo.ti_offset
> -      - __thread_pointer;
> -
> -  return __tls_get_addr_internal (&td->tlsinfo) - __thread_pointer;
> -}
> -*/
> -       cfi_startproc
> -       .align 16
> -_dl_tlsdesc_dynamic:
> -       _CET_ENDBR
> -       /* Preserve call-clobbered registers that we modify.
> -          We need two scratch regs anyway.  */
> -       movq    %rsi, -16(%rsp)
> -       mov     %fs:DTV_OFFSET, %RSI_LP
> -       movq    %rdi, -8(%rsp)
> -       movq    TLSDESC_ARG(%rax), %rdi
> -       movq    (%rsi), %rax
> -       cmpq    %rax, TLSDESC_GEN_COUNT(%rdi)
> -       ja      .Lslow
> -       movq    TLSDESC_MODID(%rdi), %rax
> -       salq    $4, %rax
> -       movq    (%rax,%rsi), %rax
> -       cmpq    $-1, %rax
> -       je      .Lslow
> -       addq    TLSDESC_MODOFF(%rdi), %rax
> -.Lret:
> -       movq    -16(%rsp), %rsi
> -       sub     %fs:0, %RAX_LP
> -       movq    -8(%rsp), %rdi
> -       ret
> -.Lslow:
> -       /* Besides rdi and rsi, saved above, save rdx, rcx, r8, r9,
> -          r10 and r11.  Also, align the stack, that's off by 8 bytes.  */
> -       subq    $72, %rsp
> -       cfi_adjust_cfa_offset (72)
> -       movq    %rdx, 8(%rsp)
> -       movq    %rcx, 16(%rsp)
> -       movq    %r8, 24(%rsp)
> -       movq    %r9, 32(%rsp)
> -       movq    %r10, 40(%rsp)
> -       movq    %r11, 48(%rsp)
> -       /* %rdi already points to the tlsinfo data structure.  */
> -       call    HIDDEN_JUMPTARGET (__tls_get_addr)
> -       movq    8(%rsp), %rdx
> -       movq    16(%rsp), %rcx
> -       movq    24(%rsp), %r8
> -       movq    32(%rsp), %r9
> -       movq    40(%rsp), %r10
> -       movq    48(%rsp), %r11
> -       addq    $72, %rsp
> -       cfi_adjust_cfa_offset (-72)
> -       jmp     .Lret
> -       cfi_endproc
> -       .size   _dl_tlsdesc_dynamic, .-_dl_tlsdesc_dynamic
> +# define USE_FXSAVE
> +# define STATE_SAVE_ALIGNMENT  16
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_fxsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_FXSAVE
> +
> +# define USE_XSAVE
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsave
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVE
> +
> +# define USE_XSAVEC
> +# define STATE_SAVE_ALIGNMENT  64
> +# define _dl_tlsdesc_dynamic   _dl_tlsdesc_dynamic_xsavec
> +# include "dl-tlsdesc-dynamic.h"
> +# undef _dl_tlsdesc_dynamic
> +# undef USE_XSAVEC
>  #endif /* SHARED */
> diff --git a/sysdeps/x86_64/dl-trampoline-save.h b/sysdeps/x86_64/dl-trampoline-save.h
> new file mode 100644
> index 0000000000..ebfbfe5c27
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-save.h
> @@ -0,0 +1,36 @@
> +/* x86-64 PLT trampoline register save macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#ifndef DL_STACK_ALIGNMENT
> +/* Due to GCC bug:
> +
> +   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> +
> +   __tls_get_addr may be called with 8-byte stack alignment.  Although
> +   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> +   that stack will be always aligned at 16 bytes.  We use unaligned
> +   16-byte move to load and store SSE registers, which has no penalty
> +   on modern processors if stack is 16-byte aligned.  */
> +# define DL_STACK_ALIGNMENT 8
> +#endif
> +
> +/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> +   stack to 16 bytes before calling _dl_fixup.  */
> +#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> +  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> +   || 16 > DL_STACK_ALIGNMENT)
> diff --git a/sysdeps/x86_64/dl-trampoline-state.h b/sysdeps/x86_64/dl-trampoline-state.h
> new file mode 100644
> index 0000000000..575f120797
> --- /dev/null
> +++ b/sysdeps/x86_64/dl-trampoline-state.h
> @@ -0,0 +1,51 @@
> +/* x86-64 PLT dl-trampoline state macros.
> +   Copyright (C) 2024 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if (STATE_SAVE_ALIGNMENT % 16) != 0
> +# error STATE_SAVE_ALIGNMENT must be multiple of 16
> +#endif
> +
> +#if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> +# error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> +#endif
> +
> +#if DL_RUNTIME_RESOLVE_REALIGN_STACK
> +/* Local stack area before jumping to function address: RBX.  */
> +# define LOCAL_STORAGE_AREA    8
> +# define BASE                  rbx
> +# ifdef USE_FXSAVE
> +/* Use fxsave to save XMM registers.  */
> +#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET)
> +#  if (REGISTER_SAVE_AREA % 16) != 0
> +#   error REGISTER_SAVE_AREA must be multiple of 16
> +#  endif
> +# endif
> +#else
> +# ifndef USE_FXSAVE
> +#  error USE_FXSAVE must be defined
> +# endif
> +/* Use fxsave to save XMM registers.  */
> +# define REGISTER_SAVE_AREA    (512 + STATE_SAVE_OFFSET + 8)
> +/* Local stack area before jumping to function address:  All saved
> +   registers.  */
> +# define LOCAL_STORAGE_AREA    REGISTER_SAVE_AREA
> +# define BASE                  rsp
> +# if (REGISTER_SAVE_AREA % 16) != 8
> +#  error REGISTER_SAVE_AREA must be odd multiple of 8
> +# endif
> +#endif
> diff --git a/sysdeps/x86_64/dl-trampoline.S b/sysdeps/x86_64/dl-trampoline.S
> index b2e7e0f69b..87c5137837 100644
> --- a/sysdeps/x86_64/dl-trampoline.S
> +++ b/sysdeps/x86_64/dl-trampoline.S
> @@ -22,25 +22,7 @@
>  #include <features-offsets.h>
>  #include <link-defines.h>
>  #include <isa-level.h>
> -
> -#ifndef DL_STACK_ALIGNMENT
> -/* Due to GCC bug:
> -
> -   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
> -
> -   __tls_get_addr may be called with 8-byte stack alignment.  Although
> -   this bug has been fixed in GCC 4.9.4, 5.3 and 6, we can't assume
> -   that stack will be always aligned at 16 bytes.  We use unaligned
> -   16-byte move to load and store SSE registers, which has no penalty
> -   on modern processors if stack is 16-byte aligned.  */
> -# define DL_STACK_ALIGNMENT 8
> -#endif
> -
> -/* True if _dl_runtime_resolve should align stack for STATE_SAVE or align
> -   stack to 16 bytes before calling _dl_fixup.  */
> -#define DL_RUNTIME_RESOLVE_REALIGN_STACK \
> -  (STATE_SAVE_ALIGNMENT > DL_STACK_ALIGNMENT \
> -   || 16 > DL_STACK_ALIGNMENT)
> +#include "dl-trampoline-save.h"
>
>  /* Area on stack to save and restore registers used for parameter
>     passing when calling _dl_fixup.  */
> diff --git a/sysdeps/x86_64/dl-trampoline.h b/sysdeps/x86_64/dl-trampoline.h
> index f55c6ea040..d9ccfb40d4 100644
> --- a/sysdeps/x86_64/dl-trampoline.h
> +++ b/sysdeps/x86_64/dl-trampoline.h
> @@ -27,39 +27,7 @@
>  # undef LOCAL_STORAGE_AREA
>  # undef BASE
>
> -# if (STATE_SAVE_ALIGNMENT % 16) != 0
> -#  error STATE_SAVE_ALIGNMENT must be multiple of 16
> -# endif
> -
> -# if (STATE_SAVE_OFFSET % STATE_SAVE_ALIGNMENT) != 0
> -#  error STATE_SAVE_OFFSET must be multiple of STATE_SAVE_ALIGNMENT
> -# endif
> -
> -# if DL_RUNTIME_RESOLVE_REALIGN_STACK
> -/* Local stack area before jumping to function address: RBX.  */
> -#  define LOCAL_STORAGE_AREA   8
> -#  define BASE                 rbx
> -#  ifdef USE_FXSAVE
> -/* Use fxsave to save XMM registers.  */
> -#   define REGISTER_SAVE_AREA  (512 + STATE_SAVE_OFFSET)
> -#   if (REGISTER_SAVE_AREA % 16) != 0
> -#    error REGISTER_SAVE_AREA must be multiple of 16
> -#   endif
> -#  endif
> -# else
> -#  ifndef USE_FXSAVE
> -#   error USE_FXSAVE must be defined
> -#  endif
> -/* Use fxsave to save XMM registers.  */
> -#  define REGISTER_SAVE_AREA   (512 + STATE_SAVE_OFFSET + 8)
> -/* Local stack area before jumping to function address:  All saved
> -   registers.  */
> -#  define LOCAL_STORAGE_AREA   REGISTER_SAVE_AREA
> -#  define BASE                 rsp
> -#  if (REGISTER_SAVE_AREA % 16) != 8
> -#   error REGISTER_SAVE_AREA must be odd multiple of 8
> -#  endif
> -# endif
> +# include "dl-trampoline-state.h"
>
>         .globl _dl_runtime_resolve
>         .hidden _dl_runtime_resolve
> --
> 2.43.0
>

  reply	other threads:[~2024-02-12 18:05 UTC|newest]

Thread overview: 3+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-02-12 13:26 H.J. Lu
2024-02-12 18:05 ` Noah Goldstein [this message]
2024-02-12 18:55   ` H.J. Lu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAFUsyf+pH1eFwsE2o04UV+UF4pQAFCpJkn92neanPgtrnupHXw@mail.gmail.com \
    --to=goldstein.w.n@gmail.com \
    --cc=fweimer@redhat.com \
    --cc=hjl.tools@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).