* [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX
@ 2024-03-17 12:55 H.J. Lu
2024-03-17 13:02 ` H.J. Lu
2024-03-18 10:22 ` Florian Weimer
0 siblings, 2 replies; 5+ messages in thread
From: H.J. Lu @ 2024-03-17 12:55 UTC (permalink / raw)
To: libc-alpha; +Cc: fweimer
_dl_tlsdesc_dynamic saves RDI, RSI and RBX on the stack before realigning
the stack. After realigning the stack, it saves RCX, RDX, R8, R9, R10 and
R11. Define TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI
and RBX so that the xsave to STATE_SAVE_OFFSET(%rsp) does not clobber the
RDI, RSI and RBX values saved on the stack.
+==================+<- stack frame start aligned at 8 or 16 bytes
| |<- RDI
| |<- RSI
| |<- RBX
| |<- paddings from stack realignment of 64 bytes
|------------------|<- xsave buffer end aligned at 64 bytes
| |<-
| |<-
| |<-
|------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
| |<- 8-byte padding
| |<- 8-byte padding
| |<- R11
| |<- R10
| |<- R9
| |<- R8
| |<- RDX
| |<- RCX
+==================+<- State buffer start aligned at 64 bytes
This fixes BZ #31501.
---
sysdeps/x86/cpu-features.c | 11 ++--
sysdeps/x86/sysdep.h | 29 ++++++++++
sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++
3 files changed, 123 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 4ea373dffa..3d7c2819d7 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features)
/* NB: On AMX capable processors, ebx always includes AMX
states. */
unsigned int xsave_state_full_size
- = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
+ = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);
cpu_features->xsave_state_size
= xsave_state_full_size;
@@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features)
unsigned int amx_size
= (xstate_amx_comp_offsets[31]
+ xstate_amx_comp_sizes[31]);
- amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
- 64);
+ amx_size
+ = ALIGN_UP ((amx_size
+ + TLSDESC_CALL_REGISTER_SAVE_AREA),
+ 64);
/* Set xsave_state_full_size to the compact AMX
state size for XSAVEC. NB: xsave_state_full_size
is only used in _dl_tlsdesc_dynamic_xsave and
@@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features)
cpu_features->xsave_state_full_size = amx_size;
#endif
cpu_features->xsave_state_size
- = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
+ = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
+ 64);
CPU_FEATURE_SET (cpu_features, XSAVEC);
}
}
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index db8e576e91..46fcd27345 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -46,6 +46,34 @@
red-zone into account. */
# define STATE_SAVE_OFFSET (8 * 7 + 8)
+/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning
+ stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and
+ R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved
+ RDI, RSI and RBX values on stack by xsave.
+
+ +==================+<- stack frame start aligned at 8 or 16 bytes
+ | |<- RDI
+ | |<- RSI
+ | |<- RBX
+ | |<- paddings from stack realignment of 64 bytes
+ |------------------|<- xsave buffer end aligned at 64 bytes
+ | |<-
+ | |<-
+ | |<-
+ |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
+ | |<- 8-byte padding
+ | |<- 8-byte padding
+ | |<- R11
+ | |<- R10
+ | |<- R9
+ | |<- R8
+ | |<- RDX
+ | |<- RCX
+ +==================+<- State buffer start aligned at 64 bytes
+
+*/
+# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)
+
/* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX
registers are mutually exclusive. */
# define STATE_SAVE_MASK \
@@ -68,6 +96,7 @@
/* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386
doesn't have red-zone, use 0 here. */
# define STATE_SAVE_OFFSET 0
+# define TLSDESC_CALL_REGISTER_SAVE_AREA 0
/* Save SSE, AVX, AXV512, mask and bound registers. */
# define STATE_SAVE_MASK \
diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
new file mode 100644
index 0000000000..449ddd5c9d
--- /dev/null
+++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
@@ -0,0 +1,87 @@
+/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.
+ Copyright (C) 2024 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec
+ to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size
+ is 960 bytes and this test didn't fail. It may be due to the unused
+ last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and
+ this test might fail without the fix. */
+#ifndef OFFSET
+# define OFFSET 104
+#endif
+
+ .text
+ .p2align 4
+ .globl apply_tls
+ .type apply_tls, @function
+apply_tls:
+ cfi_startproc
+ _CET_ENDBR
+ pushq %rbp
+ cfi_def_cfa_offset (16)
+ cfi_offset (6, -16)
+ movdqu (%RDI_LP), %xmm0 /* Load the first 16 bytes at (%rdi). */
+ lea tls_var1@TLSDESC(%rip), %RAX_LP /* TLS descriptor of tls_var1. */
+ mov %RSP_LP, %RBP_LP
+ cfi_def_cfa_register (6)
+ /* Align stack to 64 bytes. */
+ and $-64, %RSP_LP
+ sub $OFFSET, %RSP_LP /* Move %rsp OFFSET bytes below the 64-byte boundary. */
+ pushq %rbx /* Save the caller's %rbx; restored by the pop before return. */
+ /* Set %ebx and %esi to 0xbadbeef. */
+ movl $0xbadbeef, %ebx
+ movl $0xbadbeef, %esi
+ movq %rdi, saved_rdi(%rip) /* Record %rdi for the check after the call. */
+ movq %rsi, saved_rsi(%rip) /* Record %rsi for the check after the call. */
+ call *tls_var1@TLSCALL(%RAX_LP)
+ /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */
+ cmpq saved_rdi(%rip), %rdi
+ jne L(hlt)
+ cmpq saved_rsi(%rip), %rsi
+ jne L(hlt)
+ cmpl $0xbadbeef, %ebx
+ jne L(hlt)
+ add %fs:0, %RAX_LP /* Add the word at %fs:0 (presumably the thread pointer) to the call result. */
+ movups %xmm0, 32(%RAX_LP) /* Store the first 16 input bytes at offset 32. */
+ movdqu 16(%RDI_LP), %xmm1 /* Load the next 16 bytes at 16(%rdi). */
+ mov %RAX_LP, %RBX_LP /* %rbx marker is no longer needed; reuse it as the base. */
+ movups %xmm1, 48(%RAX_LP) /* Store the second 16 input bytes at offset 48. */
+ lea 32(%RBX_LP), %RAX_LP /* Return the address of the copied data. */
+ pop %rbx
+ leave
+ cfi_def_cfa (7, 8)
+ ret
+L(hlt):
+ hlt /* A register was clobbered; hlt is privileged, so this is expected to fault in user mode. */
+ cfi_endproc
+ .size apply_tls, .-apply_tls
+ .hidden tls_var1
+ .globl tls_var1
+ .section .tbss,"awT",@nobits
+ .align 16
+ .type tls_var1, @object
+ .size tls_var1, 3200
+tls_var1:
+ .zero 3200
+ .local saved_rdi
+ .comm saved_rdi,8,8
+ .local saved_rsi
+ .comm saved_rsi,8,8
+ .section .note.GNU-stack,"",@progbits
--
2.44.0
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX
2024-03-17 12:55 [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX H.J. Lu
@ 2024-03-17 13:02 ` H.J. Lu
2024-03-17 13:04 ` H.J. Lu
2024-03-18 10:22 ` Florian Weimer
1 sibling, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2024-03-17 13:02 UTC (permalink / raw)
To: libc-alpha; +Cc: fweimer
On Sun, Mar 17, 2024 at 5:55 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
> After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define
> TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
> to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
> STATE_SAVE_OFFSET(%rsp).
>
> +==================+<- stack frame start aligned at 8 or 16 bytes
> | |<- RDI
> | |<- RSI
> | |<- RBX
> | |<- paddings from stack realignment of 64 bytes
> |------------------|<- xsave buffer end aligned at 64 bytes
> | |<-
> | |<-
> | |<-
> |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> | |<- 8-byte padding
> | |<- 8-byte padding
> | |<- R11
> | |<- R10
> | |<- R9
> | |<- R8
> | |<- RDX
> | |<- RCX
> +==================+<- State buffer start aligned at 64 bytes
>
> This fixes BZ #31501.
> ---
> sysdeps/x86/cpu-features.c | 11 ++--
> sysdeps/x86/sysdep.h | 29 ++++++++++
> sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++
> 3 files changed, 123 insertions(+), 4 deletions(-)
> create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S
>
> diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> index 4ea373dffa..3d7c2819d7 100644
> --- a/sysdeps/x86/cpu-features.c
> +++ b/sysdeps/x86/cpu-features.c
> @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features)
> /* NB: On AMX capable processors, ebx always includes AMX
> states. */
> unsigned int xsave_state_full_size
> - = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
> + = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);
>
> cpu_features->xsave_state_size
> = xsave_state_full_size;
> @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features)
> unsigned int amx_size
> = (xstate_amx_comp_offsets[31]
> + xstate_amx_comp_sizes[31]);
> - amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
> - 64);
> + amx_size
> + = ALIGN_UP ((amx_size
> + + TLSDESC_CALL_REGISTER_SAVE_AREA),
> + 64);
> /* Set xsave_state_full_size to the compact AMX
> state size for XSAVEC. NB: xsave_state_full_size
> is only used in _dl_tlsdesc_dynamic_xsave and
> @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features)
> cpu_features->xsave_state_full_size = amx_size;
> #endif
> cpu_features->xsave_state_size
> - = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
> + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
> + 64);
> CPU_FEATURE_SET (cpu_features, XSAVEC);
> }
> }
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index db8e576e91..46fcd27345 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -46,6 +46,34 @@
> red-zone into account. */
> # define STATE_SAVE_OFFSET (8 * 7 + 8)
>
> +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning
> + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and
> + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved
> + RDI, RSI and RBX values on stack by xsave.
> +
> + +==================+<- stack frame start aligned at 8 or 16 bytes
> + | |<- RDI
> + | |<- RSI
> + | |<- RBX
> + | |<- paddings from stack realignment of 64 bytes
> + |------------------|<- xsave buffer end aligned at 64 bytes
> + | |<-
> + | |<-
> + | |<-
> + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> + | |<- 8-byte padding
> + | |<- 8-byte padding
> + | |<- R11
> + | |<- R10
> + | |<- R9
> + | |<- R8
> + | |<- RDX
> + | |<- RCX
> + +==================+<- State buffer start aligned at 64 bytes
> +
> +*/
> +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)
> +
> /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX
> registers are mutually exclusive. */
> # define STATE_SAVE_MASK \
> @@ -68,6 +96,7 @@
> /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386
> doesn't have red-zone, use 0 here. */
> # define STATE_SAVE_OFFSET 0
> +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0
>
> /* Save SSE, AVX, AXV512, mask and bound registers. */
> # define STATE_SAVE_MASK \
> diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> new file mode 100644
> index 0000000000..449ddd5c9d
> --- /dev/null
> +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> @@ -0,0 +1,87 @@
> +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.
> + Copyright (C) 2024 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <http://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec
> + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size
> + is 960 bytes and this test didn't fail. It may be due to the unused
> + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and
> + this test might fail without the fix. */
> +#ifndef OFFSET
> +# define OFFSET 104
> +#endif
> +
> + .text
> + .p2align 4
> + .globl apply_tls
> + .type apply_tls, @function
> +apply_tls:
> + cfi_startproc
> + _CET_ENDBR
> + pushq %rbp
> + cfi_def_cfa_offset (16)
> + cfi_offset (6, -16)
> + movdqu (%RDI_LP), %xmm0
> + lea tls_var1@TLSDESC(%rip), %RAX_LP
> + mov %RSP_LP, %RBP_LP
> + cfi_def_cfa_register (6)
> + /* Align stack to 64 bytes. */
> + and $-64, %RSP_LP
> + sub $OFFSET, %RSP_LP
> + pushq %rbx
> + /* Set %ebx to 0xbadbeef. */
> + movl $0xbadbeef, %ebx
> + movl $0xbadbeef, %esi
> + movq %rdi, saved_rdi(%rip)
> + movq %rsi, saved_rsi(%rip)
> + call *tls_var1@TLSCALL(%RAX_LP)
> + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */
> + cmpq saved_rdi(%rip), %rdi
> + jne L(hlt)
> + cmpq saved_rsi(%rip), %rsi
> + jne L(hlt)
> + cmpl $0xbadbeef, %ebx
> + jne L(hlt)
> + add %fs:0, %RAX_LP
> + movups %xmm0, 32(%RAX_LP)
> + movdqu 16(%RDI_LP), %xmm1
> + mov %RAX_LP, %RBX_LP
> + movups %xmm1, 48(%RAX_LP)
> + lea 32(%RBX_LP), %RAX_LP
> + pop %rbx
> + leave
> + cfi_def_cfa (7, 8)
> + ret
> +L(hlt):
> + hlt
> + cfi_endproc
> + .size apply_tls, .-apply_tls
> + .hidden tls_var1
> + .globl tls_var1
> + .section .tbss,"awT",@nobits
> + .align 16
> + .type tls_var1, @object
> + .size tls_var1, 3200
> +tls_var1:
> + .zero 3200
> + .local saved_rdi
> + .comm saved_rdi,8,8
> + .local saved_rsi
> + .comm saved_rsi,8,8
> + .section .note.GNU-stack,"",@progbits
> --
> 2.44.0
>
I need to adjust the assembly code.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX
2024-03-17 13:02 ` H.J. Lu
@ 2024-03-17 13:04 ` H.J. Lu
0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2024-03-17 13:04 UTC (permalink / raw)
To: libc-alpha; +Cc: fweimer
On Sun, Mar 17, 2024 at 6:02 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Sun, Mar 17, 2024 at 5:55 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
> > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define
> > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
> > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
> > STATE_SAVE_OFFSET(%rsp).
> >
> > +==================+<- stack frame start aligned at 8 or 16 bytes
> > | |<- RDI
> > | |<- RSI
> > | |<- RBX
> > | |<- paddings from stack realignment of 64 bytes
> > |------------------|<- xsave buffer end aligned at 64 bytes
> > | |<-
> > | |<-
> > | |<-
> > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> > | |<- 8-byte padding
> > | |<- 8-byte padding
> > | |<- R11
> > | |<- R10
> > | |<- R9
> > | |<- R8
> > | |<- RDX
> > | |<- RCX
> > +==================+<- State buffer start aligned at 64 bytes
> >
> > This fixes BZ #31501.
> > ---
> > sysdeps/x86/cpu-features.c | 11 ++--
> > sysdeps/x86/sysdep.h | 29 ++++++++++
> > sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++
> > 3 files changed, 123 insertions(+), 4 deletions(-)
> > create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S
> >
> > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
> > index 4ea373dffa..3d7c2819d7 100644
> > --- a/sysdeps/x86/cpu-features.c
> > +++ b/sysdeps/x86/cpu-features.c
> > @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features)
> > /* NB: On AMX capable processors, ebx always includes AMX
> > states. */
> > unsigned int xsave_state_full_size
> > - = ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64);
> > + = ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64);
> >
> > cpu_features->xsave_state_size
> > = xsave_state_full_size;
> > @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features)
> > unsigned int amx_size
> > = (xstate_amx_comp_offsets[31]
> > + xstate_amx_comp_sizes[31]);
> > - amx_size = ALIGN_UP (amx_size + STATE_SAVE_OFFSET,
> > - 64);
> > + amx_size
> > + = ALIGN_UP ((amx_size
> > + + TLSDESC_CALL_REGISTER_SAVE_AREA),
> > + 64);
> > /* Set xsave_state_full_size to the compact AMX
> > state size for XSAVEC. NB: xsave_state_full_size
> > is only used in _dl_tlsdesc_dynamic_xsave and
> > @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features)
> > cpu_features->xsave_state_full_size = amx_size;
> > #endif
> > cpu_features->xsave_state_size
> > - = ALIGN_UP (size + STATE_SAVE_OFFSET, 64);
> > + = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
> > + 64);
> > CPU_FEATURE_SET (cpu_features, XSAVEC);
> > }
> > }
> > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > index db8e576e91..46fcd27345 100644
> > --- a/sysdeps/x86/sysdep.h
> > +++ b/sysdeps/x86/sysdep.h
> > @@ -46,6 +46,34 @@
> > red-zone into account. */
> > # define STATE_SAVE_OFFSET (8 * 7 + 8)
> >
> > +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning
> > + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and
> > + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved
> > + RDI, RSI and RBX values on stack by xsave.
> > +
> > + +==================+<- stack frame start aligned at 8 or 16 bytes
> > + | |<- RDI
> > + | |<- RSI
> > + | |<- RBX
> > + | |<- paddings from stack realignment of 64 bytes
> > + |------------------|<- xsave buffer end aligned at 64 bytes
> > + | |<-
> > + | |<-
> > + | |<-
> > + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> > + | |<- 8-byte padding
> > + | |<- 8-byte padding
> > + | |<- R11
> > + | |<- R10
> > + | |<- R9
> > + | |<- R8
> > + | |<- RDX
> > + | |<- RCX
> > + +==================+<- State buffer start aligned at 64 bytes
> > +
> > +*/
> > +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24)
> > +
> > /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX
> > registers are mutually exclusive. */
> > # define STATE_SAVE_MASK \
> > @@ -68,6 +96,7 @@
> > /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386
> > doesn't have red-zone, use 0 here. */
> > # define STATE_SAVE_OFFSET 0
> > +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0
> >
> > /* Save SSE, AVX, AXV512, mask and bound registers. */
> > # define STATE_SAVE_MASK \
> > diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> > new file mode 100644
> > index 0000000000..449ddd5c9d
> > --- /dev/null
> > +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S
> > @@ -0,0 +1,87 @@
> > +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx.
> > + Copyright (C) 2024 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <http://www.gnu.org/licenses/>. */
> > +
> > +#include <sysdep.h>
> > +
> > +/* On AVX512 machines, OFFSET == 104 caused _dl_tlsdesc_dynamic_xsavec
> > + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size
> > + is 960 bytes and this test didn't fail. It may be due to the unused
> > + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and
> > + this test might fail without the fix. */
> > +#ifndef OFFSET
> > +# define OFFSET 104
> > +#endif
> > +
> > + .text
> > + .p2align 4
> > + .globl apply_tls
> > + .type apply_tls, @function
> > +apply_tls:
> > + cfi_startproc
> > + _CET_ENDBR
> > + pushq %rbp
> > + cfi_def_cfa_offset (16)
> > + cfi_offset (6, -16)
> > + movdqu (%RDI_LP), %xmm0
> > + lea tls_var1@TLSDESC(%rip), %RAX_LP
> > + mov %RSP_LP, %RBP_LP
> > + cfi_def_cfa_register (6)
> > + /* Align stack to 64 bytes. */
> > + and $-64, %RSP_LP
> > + sub $OFFSET, %RSP_LP
> > + pushq %rbx
> > + /* Set %ebx to 0xbadbeef. */
> > + movl $0xbadbeef, %ebx
> > + movl $0xbadbeef, %esi
> > + movq %rdi, saved_rdi(%rip)
> > + movq %rsi, saved_rsi(%rip)
> > + call *tls_var1@TLSCALL(%RAX_LP)
> > + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. */
> > + cmpq saved_rdi(%rip), %rdi
> > + jne L(hlt)
> > + cmpq saved_rsi(%rip), %rsi
> > + jne L(hlt)
> > + cmpl $0xbadbeef, %ebx
> > + jne L(hlt)
> > + add %fs:0, %RAX_LP
> > + movups %xmm0, 32(%RAX_LP)
> > + movdqu 16(%RDI_LP), %xmm1
> > + mov %RAX_LP, %RBX_LP
> > + movups %xmm1, 48(%RAX_LP)
> > + lea 32(%RBX_LP), %RAX_LP
> > + pop %rbx
> > + leave
> > + cfi_def_cfa (7, 8)
> > + ret
> > +L(hlt):
> > + hlt
> > + cfi_endproc
> > + .size apply_tls, .-apply_tls
> > + .hidden tls_var1
> > + .globl tls_var1
> > + .section .tbss,"awT",@nobits
> > + .align 16
> > + .type tls_var1, @object
> > + .size tls_var1, 3200
> > +tls_var1:
> > + .zero 3200
> > + .local saved_rdi
> > + .comm saved_rdi,8,8
> > + .local saved_rsi
> > + .comm saved_rsi,8,8
> > + .section .note.GNU-stack,"",@progbits
> > --
> > 2.44.0
> >
>
> I need to adjust assembly codes.
>
Never mind. Not needed.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX
2024-03-17 12:55 [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX H.J. Lu
2024-03-17 13:02 ` H.J. Lu
@ 2024-03-18 10:22 ` Florian Weimer
2024-03-18 12:32 ` H.J. Lu
1 sibling, 1 reply; 5+ messages in thread
From: Florian Weimer @ 2024-03-18 10:22 UTC (permalink / raw)
To: H.J. Lu; +Cc: libc-alpha
* H. J. Lu:
> _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
> After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define
> TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
> to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
> STATE_SAVE_OFFSET(%rsp).
>
> +==================+<- stack frame start aligned at 8 or 16 bytes
It's 8-byte aligned only? If the caller uses the psABI convention, we
have %rsp ≡ 8 (mod 16).
> | |<- RDI
> | |<- RSI
> | |<- RBX
I would add something like “originally in the red zone” here.
> | |<- paddings from stack realignment of 64 bytes
> |------------------|<- xsave buffer end aligned at 64 bytes
> | |<-
> | |<-
> | |<-
> |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> | |<- 8-byte padding
Maybe add “to achieve 64-byte alignment”.
> | |<- 8-byte padding
> | |<- R11
> | |<- R10
> | |<- R9
> | |<- R8
> | |<- RDX
> | |<- RCX
> +==================+<- State buffer start aligned at 64 bytes
Isn't this “%rsp aligned at 64 bytes”?
Likewise for the comment in the sources below.
> diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> index db8e576e91..46fcd27345 100644
> --- a/sysdeps/x86/sysdep.h
> +++ b/sysdeps/x86/sysdep.h
> @@ -46,6 +46,34 @@
> red-zone into account. */
> # define STATE_SAVE_OFFSET (8 * 7 + 8)
The comment on STATE_SAVE_OFFSET needs updating, too.
I would like to see comments from Noah or Sunil. You'll have to
maintain this, too. 8-)
I find the macro consts rather confusing, but maybe that's just me.
Thanks,
Florian
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX
2024-03-18 10:22 ` Florian Weimer
@ 2024-03-18 12:32 ` H.J. Lu
0 siblings, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2024-03-18 12:32 UTC (permalink / raw)
To: Florian Weimer; +Cc: libc-alpha
On Mon, Mar 18, 2024 at 3:22 AM Florian Weimer <fweimer@redhat.com> wrote:
>
> * H. J. Lu:
>
> > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack.
> > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define
> > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX
> > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to
> > STATE_SAVE_OFFSET(%rsp).
> >
> > +==================+<- stack frame start aligned at 8 or 16 bytes
>
> It's 8-byte aligned only? If the caller uses the psABI convention, we
> have %rsp ≡ 8 (mod 16).
It is due to:
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
Do we need to support older compilers?
>
> > | |<- RDI
> > | |<- RSI
> > | |<- RBX
>
> I would add something like “originally in the red zone” here.
Fixed.
> > | |<- paddings from stack realignment of 64 bytes
> > |------------------|<- xsave buffer end aligned at 64 bytes
>
>
> > | |<-
> > | |<-
> > | |<-
> > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp)
> > | |<- 8-byte padding
>
> > Maybe add “to achieve 64-byte alignment”.
Fixed.
> > | |<- 8-byte padding
> > | |<- R11
> > | |<- R10
> > | |<- R9
> > | |<- R8
> > | |<- RDX
> > | |<- RCX
> > +==================+<- State buffer start aligned at 64 bytes
>
> Isn't this “%rsp aligned at 64 bytes”?
Fixed.
> Likewise for the comment in the sources below.
Fixed.
> > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
> > index db8e576e91..46fcd27345 100644
> > --- a/sysdeps/x86/sysdep.h
> > +++ b/sysdeps/x86/sysdep.h
> > @@ -46,6 +46,34 @@
> > red-zone into account. */
> > # define STATE_SAVE_OFFSET (8 * 7 + 8)
>
> The comment on STATE_SAVE_OFFSET needs updating, too.
Fixed.
> I would like to see comments from Noah or Sunil. You'll have to
> maintain this, too. 8-)
CCed Noah and Sunil.
> I find the macro consts rather confusing, but maybe that's just me.
Can you suggest a different name?
The v5 patch is at
https://patchwork.sourceware.org/project/glibc/list/?series=31992
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2024-03-18 12:33 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-03-17 12:55 [PATCH v4] x86-64: Allocate state buffer space for RDI, RSI and RBX H.J. Lu
2024-03-17 13:02 ` H.J. Lu
2024-03-17 13:04 ` H.J. Lu
2024-03-18 10:22 ` Florian Weimer
2024-03-18 12:32 ` H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).