From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-wm1-x32d.google.com (mail-wm1-x32d.google.com [IPv6:2a00:1450:4864:20::32d]) by sourceware.org (Postfix) with ESMTPS id 7BEC83858CDB for ; Mon, 18 Mar 2024 23:21:19 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.2 sourceware.org 7BEC83858CDB Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com ARC-Filter: OpenARC Filter v1.0.0 sourceware.org 7BEC83858CDB Authentication-Results: server2.sourceware.org; arc=none smtp.remote-ip=2a00:1450:4864:20::32d ARC-Seal: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1710804082; cv=none; b=uYzJ/h0Yj3qDMnUqbNnR8pxMx58XPT9EKKaABxnO0ROJafMNGp7zpszmbBL949RWK0vuLASh6wccOlqIZ+EnHrvA5l/gOBknpxezC5G2RuRXl+AJ8G/Lg5+iQI63NqwUxS7NHJiAifdb+l7ERZksLfsI0U8hFJ7zHNXshTCuwlw= ARC-Message-Signature: i=1; a=rsa-sha256; d=sourceware.org; s=key; t=1710804082; c=relaxed/simple; bh=5FqxkJuh3Mfb0FHf61P0njX7WR+SKPl169mqv9rdNGA=; h=DKIM-Signature:MIME-Version:From:Date:Message-ID:Subject:To; b=iP3/AG5QYSjO10G+eOP2/RcIlG2tt4i60AfEfKT3tswYrxyluBS4pJ1NMbuhCOTTT8qyikaxBCt7TfCjJlYQnmWQnNWBHMzl+LimW64Wq0VArLjFV+vTS/VSVxK3eCDw/am1vH2eTehEMOkZDKOt5rlhpyBB22Nf5qqehNhN+PI= ARC-Authentication-Results: i=1; server2.sourceware.org Received: by mail-wm1-x32d.google.com with SMTP id 5b1f17b1804b1-4146172fb7eso6146785e9.1 for ; Mon, 18 Mar 2024 16:21:19 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20230601; t=1710804078; x=1711408878; darn=sourceware.org; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:from:to:cc:subject:date:message-id:reply-to; bh=tvMIQWyCQ6nRprYoBkML0XBZmtGX9NLLVBUwT6f2gRk=; b=WrWZXgE0WqxueCQrQ4HVTLtXzXlwgDD2HIM29aE/bDnHKtHbuXK7jcJS1CdXAPh0nl iVgZZnU0P5kRWDnGLEhA8hwTQDrLP7N5vZ3ET/wEmjK5qGItbn8dvx7YMN+KGXx5BbW7 wTeqof7qDnow7Te0Re58BjEn3EFYh0mHVghB51eaKo+F73xJvwWmLMy0CKEr5iGC20Dy 
OrOktEANJYtS+rmfJdNAi+TrtZKbEEIa0HpRL6Cvg9302rtI4mt107WoyO8xqjONz+wO V4+bBtRLkZl3W9KomX8pxkKPOSzKxqg0bysFjcQmqN82hnTYopK2MmS5o5tvKgwYlPKn KubA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20230601; t=1710804078; x=1711408878; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:x-gm-message-state:from:to:cc:subject:date:message-id :reply-to; bh=tvMIQWyCQ6nRprYoBkML0XBZmtGX9NLLVBUwT6f2gRk=; b=N+1k07/PVMvpamL2MKkt34N2wpxBGH/5MbgmB+zpcL5t5XtyOcVFH0CkD2W6y1Kiwn t4frdqMGrizWXj5Pf4nKCWaJdFhKDC+eOi1P0kvQzCtzUKuaMO0uZIr6ub0j4zYoHbtR +ZEXXFEbTisLnJyRGArmZx5gipv4SCrVvEhp+JJHA1JNWx6OOkDiiyEnbhg9Cx9zcb5k 13eRZIlRpDpnjPCHo/cdr1ckXngpeV9hbi6P/wNq+wxEjNf9TA/RnifJB7LMwtQxEAlf YLTUeIAOMMyrycaibr6A48mQKZg63CGfQdCm1BiTGXyQ+/mk98Ask7DGftxKDJsQvlJD WYVQ== X-Gm-Message-State: AOJu0Ywvq3/sfFjY+D0CN6hyrqpYnw3bBClFanCeJGqVlCaK8ZJNjt4P LUFOAxkxHHFPrc8bih6kkUpeGs963R2kEXKKQCzPdiKyRrkvxTDPTh88CY31p9Zu4v0i0I+/q+F kIG7Fo3jYtD8rCDs6QAqYRCSOuOk= X-Google-Smtp-Source: AGHT+IGvJaHDLjnaDVH5+ggYZ6/MdOIwTjX5lO0pYPc0THRnrtgynIZvPp1YCV+VUNLOo4CpsUQuO0EaM3JCd7wAD3g= X-Received: by 2002:a05:600c:68c8:b0:413:2a10:8a29 with SMTP id jd8-20020a05600c68c800b004132a108a29mr7187329wmb.13.1710804077966; Mon, 18 Mar 2024 16:21:17 -0700 (PDT) MIME-Version: 1.0 References: <20240318134016.820218-1-hjl.tools@gmail.com> In-Reply-To: <20240318134016.820218-1-hjl.tools@gmail.com> From: Sunil Pandey Date: Mon, 18 Mar 2024 16:20:41 -0700 Message-ID: Subject: Re: [PATCH v6] x86-64: Allocate state buffer space for RDI, RSI and RBX To: "H.J. 
Lu" Cc: libc-alpha@sourceware.org, fweimer@redhat.com, goldstein.w.n@gmail.com Content-Type: multipart/alternative; boundary="00000000000058fb700613f79d82" X-Spam-Status: No, score=-8.1 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_ENVFROM_END_DIGIT,FREEMAIL_FROM,GIT_PATCH_0,HK_RANDOM_ENVFROM,HK_RANDOM_FROM,HTML_MESSAGE,KAM_SHORT,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: --00000000000058fb700613f79d82 Content-Type: text/plain; charset="UTF-8" Content-Transfer-Encoding: quoted-printable On Mon, Mar 18, 2024 at 6:40=E2=80=AFAM H.J. Lu wrote: > _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning stack. > After realigning stack, it saves RCX, RDX, R8, R9, R10 and R11. Define > TLSDESC_CALL_REGISTER_SAVE_AREA to allocate space for RDI, RSI and RBX > to avoid clobbering saved RDI, RSI and RBX values on stack by xsave to > STATE_SAVE_OFFSET(%rsp). > > +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- stack frame= start aligned at 8 or 16 bytes > | |<- RDI saved in the red zone > | |<- RSI saved in the red zone > | |<- RBX saved in the red zone > | |<- paddings for stack realignment of 64 bytes > |------------------|<- xsave buffer end aligned at 64 bytes > | |<- > | |<- > | |<- > |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > | |<- 8-byte padding for 64-byte alignment > | |<- 8-byte padding for 64-byte alignment > | |<- R11 > | |<- R10 > | |<- R9 > | |<- R8 > | |<- RDX > | |<- RCX > +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- RSP aligned= at 64 bytes > > Define TLSDESC_CALL_REGISTER_SAVE_AREA, the total register save area size > for all integer registers by adding 24 to STATE_SAVE_OFFSET since RDI, RSI > and RBX are saved onto stack without adjusting stack pointer first, using > the red-zone. 
This fixes BZ #31501. > --- > sysdeps/x86/cpu-features.c | 11 ++-- > sysdeps/x86/sysdep.h | 60 ++++++++++++++++++--- > sysdeps/x86_64/tst-gnu2-tls2mod1.S | 87 ++++++++++++++++++++++++++++++ > 3 files changed, 147 insertions(+), 11 deletions(-) > create mode 100644 sysdeps/x86_64/tst-gnu2-tls2mod1.S > > diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c > index 4ea373dffa..3d7c2819d7 100644 > --- a/sysdeps/x86/cpu-features.c > +++ b/sysdeps/x86/cpu-features.c > @@ -311,7 +311,7 @@ update_active (struct cpu_features *cpu_features) > /* NB: On AMX capable processors, ebx always includes AMX > states. */ > unsigned int xsave_state_full_size > - =3D ALIGN_UP (ebx + STATE_SAVE_OFFSET, 64); > + =3D ALIGN_UP (ebx + TLSDESC_CALL_REGISTER_SAVE_AREA, 64); > > cpu_features->xsave_state_size > =3D xsave_state_full_size; > @@ -401,8 +401,10 @@ update_active (struct cpu_features *cpu_features) > unsigned int amx_size > =3D (xstate_amx_comp_offsets[31] > + xstate_amx_comp_sizes[31]); > - amx_size =3D ALIGN_UP (amx_size + STATE_SAVE_OFFSET, > - 64); > + amx_size > + =3D ALIGN_UP ((amx_size > + + TLSDESC_CALL_REGISTER_SAVE_AREA), > + 64); > /* Set xsave_state_full_size to the compact AMX > state size for XSAVEC. NB: xsave_state_full_size > is only used in _dl_tlsdesc_dynamic_xsave and > @@ -410,7 +412,8 @@ update_active (struct cpu_features *cpu_features) > cpu_features->xsave_state_full_size =3D amx_size; > #endif > cpu_features->xsave_state_size > - =3D ALIGN_UP (size + STATE_SAVE_OFFSET, 64); > + =3D ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_A= REA, > + 64); > CPU_FEATURE_SET (cpu_features, XSAVEC); > } > } > diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h > index db8e576e91..7359149e17 100644 > --- a/sysdeps/x86/sysdep.h > +++ b/sysdeps/x86/sysdep.h > @@ -38,14 +38,59 @@ > #ifdef __x86_64__ > /* Offset for fxsave/xsave area used by _dl_runtime_resolve. Also need > space to preserve RCX, RDX, RSI, RDI, R8, R9 and RAX. 
It must be > - aligned to 16 bytes for fxsave and 64 bytes for xsave. > - > - NB: Is is non-zero because of the 128-byte red-zone. Some registers > - are saved on stack without adjusting stack pointer first. When we > - update stack pointer to allocate more space, we need to take the > - red-zone into account. */ > + aligned to 16 bytes for fxsave and 64 bytes for xsave. It is non-zero > + because MOV, instead of PUSH, is used to save registers onto stack. > + > + +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- stack fram= e start aligned at 8 or 16 bytes > + | |<- paddings for stack realignment of 64 bytes > + |------------------|<- xsave buffer end aligned at 64 bytes > + | |<- > + | |<- > + | |<- > + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > + | |<- 8-byte padding for 64-byte alignment > + | |<- R9 > + | |<- R8 > + | |<- RDI > + | |<- RSI > + | |<- RDX > + | |<- RCX > + | |<- RAX > + +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- RSP aligne= d at 64 bytes > + > + */ > # define STATE_SAVE_OFFSET (8 * 7 + 8) > > +/* _dl_tlsdesc_dynamic preserves RDI, RSI and RBX before realigning > + stack. After realigning stack, it saves RCX, RDX, R8, R9, R10 and > + R11. Allocate space for RDI, RSI and RBX to avoid clobbering saved > + RDI, RSI and RBX values on stack by xsave. 
> + > + +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- stack fram= e start aligned at 8 or 16 bytes > + | |<- RDI saved in the red zone > + | |<- RSI saved in the red zone > + | |<- RBX saved in the red zone > + | |<- paddings for stack realignment of 64 bytes > + |------------------|<- xsave buffer end aligned at 64 bytes > + | |<- > + | |<- > + | |<- > + |------------------|<- xsave buffer start at STATE_SAVE_OFFSET(%rsp) > + | |<- 8-byte padding for 64-byte alignment > + | |<- 8-byte padding for 64-byte alignment > + | |<- R11 > + | |<- R10 > + | |<- R9 > + | |<- R8 > + | |<- RDX > + | |<- RCX > + +=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D+<- RSP aligne= d at 64 bytes > + > + Define the total register save area size for all integer registers by > + adding 24 to STATE_SAVE_OFFSET since RDI, RSI and RBX are saved onto > + stack without adjusting stack pointer first, using the red-zone. */ > +# define TLSDESC_CALL_REGISTER_SAVE_AREA (STATE_SAVE_OFFSET + 24) > + > /* Save SSE, AVX, AVX512, mask, bound and APX registers. Bound and APX > registers are mutually exclusive. */ > # define STATE_SAVE_MASK \ > @@ -66,8 +111,9 @@ > (STATE_SAVE_MASK | AMX_STATE_SAVE_MASK) > #else > /* Offset for fxsave/xsave area used by _dl_tlsdesc_dynamic. Since i386 > - doesn't have red-zone, use 0 here. */ > + uses PUSH to save registers onto stack, use 0 here. */ > # define STATE_SAVE_OFFSET 0 > +# define TLSDESC_CALL_REGISTER_SAVE_AREA 0 > > /* Save SSE, AVX, AXV512, mask and bound registers. */ > # define STATE_SAVE_MASK \ > diff --git a/sysdeps/x86_64/tst-gnu2-tls2mod1.S > b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > new file mode 100644 > index 0000000000..1d636669ba > --- /dev/null > +++ b/sysdeps/x86_64/tst-gnu2-tls2mod1.S > @@ -0,0 +1,87 @@ > +/* Check if TLSDESC relocation preserves %rdi, %rsi and %rbx. > + Copyright (C) 2024 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <sysdep.h> > + > +/* On AVX512 machines, OFFSET =3D=3D 40 caused _dl_tlsdesc_dynamic_xsavec > + to clobber %rdi, %rsi and %rbx. On Intel AVX CPUs, the state size > + is 960 bytes and this test didn't fail. It may be due to the unused > + last 128 bytes. On AMD AVX CPUs, the state size is 832 bytes and > + this test might fail without the fix. */ > +#ifndef OFFSET > +# define OFFSET 40 > +#endif > + > + .text > + .p2align 4 > + .globl apply_tls > + .type apply_tls, @function > +apply_tls: > + cfi_startproc > + _CET_ENDBR > + pushq %rbp > + cfi_def_cfa_offset (16) > + cfi_offset (6, -16) > + movdqu (%RDI_LP), %xmm0 > + lea tls_var1@TLSDESC(%rip), %RAX_LP > + mov %RSP_LP, %RBP_LP > + cfi_def_cfa_register (6) > + /* Align stack to 64 bytes. */ > + and $-64, %RSP_LP > + sub $OFFSET, %RSP_LP > + pushq %rbx > + /* Set %ebx to 0xbadbeef. */ > + movl $0xbadbeef, %ebx > + movl $0xbadbeef, %esi > + movq %rdi, saved_rdi(%rip) > + movq %rsi, saved_rsi(%rip) > + call *tls_var1@TLSCALL(%RAX_LP) > + /* Check if _dl_tlsdesc_dynamic preserves %rdi, %rsi and %rbx. 
*/ > + cmpq saved_rdi(%rip), %rdi > + jne L(hlt) > + cmpq saved_rsi(%rip), %rsi > + jne L(hlt) > + cmpl $0xbadbeef, %ebx > + jne L(hlt) > + add %fs:0, %RAX_LP > + movups %xmm0, 32(%RAX_LP) > + movdqu 16(%RDI_LP), %xmm1 > + mov %RAX_LP, %RBX_LP > + movups %xmm1, 48(%RAX_LP) > + lea 32(%RBX_LP), %RAX_LP > + pop %rbx > + leave > + cfi_def_cfa (7, 8) > + ret > +L(hlt): > + hlt > + cfi_endproc > + .size apply_tls, .-apply_tls > + .hidden tls_var1 > + .globl tls_var1 > + .section .tbss,"awT",@nobits > + .align 16 > + .type tls_var1, @object > + .size tls_var1, 3200 > +tls_var1: > + .zero 3200 > + .local saved_rdi > + .comm saved_rdi,8,8 > + .local saved_rsi > + .comm saved_rsi,8,8 > + .section .note.GNU-stack,"",@progbits > -- > 2.44.0 > > LGTM Reviewed-by: Sunil K Pandey --00000000000058fb700613f79d82--