From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from linux.microsoft.com (linux.microsoft.com [13.77.154.182]) by sourceware.org (Postfix) with ESMTP id 107373857C7A for ; Thu, 28 Jan 2021 21:59:59 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org 107373857C7A Received: from [192.168.254.32] (unknown [47.187.219.45]) by linux.microsoft.com (Postfix) with ESMTPSA id B2C5720B7192; Thu, 28 Jan 2021 13:59:56 -0800 (PST) DKIM-Filter: OpenDKIM Filter v2.11.0 linux.microsoft.com B2C5720B7192 Subject: Re: [RFC PATCH v3 2/5] x86: Support for Static Trampolines To: DJ Delorie Cc: libffi-discuss@sourceware.org References: From: "Madhavan T. Venkataraman" Message-ID: Date: Thu, 28 Jan 2021 15:59:55 -0600 User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Thunderbird/68.10.0 MIME-Version: 1.0 In-Reply-To: Content-Type: text/plain; charset=utf-8 Content-Language: en-US Content-Transfer-Encoding: 7bit X-Spam-Status: No, score=-26.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, ENV_AND_HDR_SPF_MATCH, GIT_PATCH_0, NICE_REPLY_A, SPF_HELO_PASS, SPF_PASS, TXREP, USER_IN_DEF_DKIM_WL, USER_IN_DEF_SPF_WL autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libffi-discuss@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libffi-discuss mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 28 Jan 2021 22:00:01 -0000 Thanks for reviewing this. My responses inline... On 1/26/21 9:31 PM, DJ Delorie wrote: > > madvenka@linux.microsoft.com writes: > >> diff --git a/src/x86/ffi64.c b/src/x86/ffi64.c >> index 39f9598..2a5cf5a 100644 >> --- a/src/x86/ffi64.c >> +++ b/src/x86/ffi64.c >> @@ -713,7 +713,9 @@ ffi_call_go (ffi_cif *cif, void (*fn)(void), void *rvalue, >> #endif /* FFI_GO_CLOSURES */ >> >> extern void ffi_closure_unix64(void) FFI_HIDDEN; >> +extern void ffi_closure_unix64_alt(void) FFI_HIDDEN; >> extern void ffi_closure_unix64_sse(void) FFI_HIDDEN; >> +extern void ffi_closure_unix64_sse_alt(void) FFI_HIDDEN; > > Extern, but local to this port, yes? > Yes. So, is this declaration acceptable? >> @@ -742,6 +744,7 @@ ffi_prep_closure_loc (ffi_closure* closure, >> 0x0f, 0x1f, 0x80, 0x00, 0x00, 0x00, 0x00 >> }; >> void (*dest)(void); >> + void (*dest_alt)(void); >> char *tramp = closure->tramp; > > Ok > >> @@ -752,13 +755,28 @@ ffi_prep_closure_loc (ffi_closure* closure, >> return FFI_BAD_ABI; >> >> if (cif->flags & UNIX64_FLAG_XMM_ARGS) >> - dest = ffi_closure_unix64_sse; >> + { >> + dest = ffi_closure_unix64_sse; >> + dest_alt = ffi_closure_unix64_sse_alt; >> + } >> else >> - dest = ffi_closure_unix64; >> + { >> + dest = ffi_closure_unix64; >> + dest_alt = ffi_closure_unix64_alt; >> + } >> >> + if (ffi_tramp_is_present(closure)) >> + { >> + /* Initialize the static trampoline's parameters. */ >> + ffi_tramp_set_parms (closure->ftramp, dest_alt, closure); >> + goto out; >> + } >> + >> + /* Initialize the dynamic trampoline. */ > > Should these new APIs be inside #if FFI_EXEC_STATIC_TRAMP ? > Strictly speaking, these should be inside that ifdef. I did it this way to avoid too many ifdefs in the code. If you prefer I put them inside the ifdefs, I will do it. I will try to minimize the number of ifdefs somehow. Please confirm. >> +#if defined(FFI_EXEC_STATIC_TRAMP) >> +void * >> +ffi_tramp_arch (size_t *tramp_size, size_t *map_size) >> +{ >> + extern void *trampoline_code_table_cet; >> + extern void *trampoline_code_table; >> + extern int ffi_cet_present; >> + >> + *map_size = UNIX64_TRAMP_MAP_SIZE; >> + if (ffi_cet_present) { >> + *tramp_size = UNIX64_TRAMP_SIZE_CET; >> + return &trampoline_code_table_cet; >> + } >> + *tramp_size = UNIX64_TRAMP_SIZE; >> + return &trampoline_code_table; >> +} >> +#endif > > Ok. > >> diff --git a/src/x86/ffiw64.c b/src/x86/ffiw64.c >> index a43a9eb..df81d66 100644 >> --- a/src/x86/ffiw64.c >> +++ b/src/x86/ffiw64.c >> @@ -187,6 +187,7 @@ EFI64(ffi_call_go)(ffi_cif *cif, void (*fn)(void), void *rvalue, >> >> >> extern void ffi_closure_win64(void) FFI_HIDDEN; >> +extern void ffi_closure_win64_alt(void) FFI_HIDDEN; >> >> #ifdef FFI_GO_CLOSURES >> extern void ffi_go_closure_win64(void) FFI_HIDDEN; >> @@ -220,9 +221,18 @@ EFI64(ffi_prep_closure_loc)(ffi_closure* closure, >> return FFI_BAD_ABI; >> } >> >> + if (ffi_tramp_is_present(closure)) >> + { >> + /* Initialize the static trampoline's parameters. */ >> + ffi_tramp_set_parms (closure->ftramp, ffi_closure_win64_alt, closure); >> + goto out; >> + } >> + >> + /* Initialize the dynamic trampoline. */ >> memcpy (tramp, trampoline, sizeof(trampoline)); >> *(UINT64 *)(tramp + sizeof (trampoline)) = (uintptr_t)ffi_closure_win64; >> >> +out: >> closure->cif = cif; >> closure->fun = fun; >> closure->user_data = user_data; > > Ok. > >> diff --git a/src/x86/internal64.h b/src/x86/internal64.h >> index 512e955..410bdf2 100644 >> --- a/src/x86/internal64.h >> +++ b/src/x86/internal64.h >> @@ -20,3 +20,14 @@ >> #define UNIX64_FLAG_RET_IN_MEM (1 << 10) >> #define UNIX64_FLAG_XMM_ARGS (1 << 11) >> #define UNIX64_SIZE_SHIFT 12 >> + >> +#if defined(FFI_EXEC_STATIC_TRAMP) >> +/* >> + * For the trampoline code table mapping, a mapping size of 4K (base page size) >> + * is chosen. >> + */ >> +#define UNIX64_TRAMP_MAP_SHIFT 12 >> +#define UNIX64_TRAMP_MAP_SIZE (1 << UNIX64_TRAMP_MAP_SHIFT) >> +#define UNIX64_TRAMP_SIZE_CET 40 >> +#define UNIX64_TRAMP_SIZE 32 >> +#endif > > Ok. > >> diff --git a/src/x86/unix64.S b/src/x86/unix64.S >> index 89d7db1..e26ea2c 100644 >> --- a/src/x86/unix64.S >> +++ b/src/x86/unix64.S >> @@ -63,6 +63,7 @@ >> C(ffi_call_unix64): >> L(UW0): >> _CET_ENDBR >> +L(endbr): > > This hack to detect CET should be replaced by the logic in ffitarget.h, > or add a #define CET_ENABLED to ffitarget.h > So, _CET_ENDBR for x64 is either defined as: If CET is present: #define _CET_ENDBR endbr64 Otherwise: #define _CET_ENDBR So, it is always defined. So, I cannot do something like: #ifdef _CET_ENDBR This will always be true. I did not find a standard, acceptable, compatible preprocessor way to test the actual value of a preprocessor symbol. Like _CET_ENDBR == endbr64. A couple of hacky solutions are mentioned in some posts when I googled the topic. There did not seem to be a standard way. The only way is for me to use information in cet.h directly. cet.h says this: # if defined __CET__ && (__CET__ & 1) != 0 # ifdef __x86_64__ # define _CET_ENDBR endbr64 # else # define _CET_ENDBR endbr32 # endif # else # define _CET_ENDBR # endif I wasn't sure if I am allowed to use (__CET__ & 1) != 0 in libffi. Is this internal to cet.h? If not, I can implement what you requested. Is this acceptable? >> @@ -270,6 +271,17 @@ L(UW6): >> L(UW7): >> ENDF(C(ffi_closure_unix64_sse)) >> >> + .balign 2 >> + .globl C(ffi_closure_unix64_sse_alt) >> + FFI_HIDDEN(C(ffi_closure_unix64_sse_alt)) >> + >> +C(ffi_closure_unix64_sse_alt): >> + _CET_ENDBR >> + movq 8(%rsp), %r10 >> + addq $16, %rsp > > Copies first argument to %r10, discards return address and arg - closure > will return to whoever called it's caller. I'm not sure how this works, > which means *at least* a comment needs to be here ;-) > Control is transferred to the alt entry points via the static trampoline. Here is the comment above the static trampoline code table definition about each individual trampoline in the table: /* * The trampoline uses register r10. It saves the original value of r10 on * the stack. * * The trampoline has two parameters - target code to jump to and data for * the target code. The trampoline extracts the parameters from its parameter * block (see tramp_table_map()). The trampoline saves the data address on * the stack. Finally, it jumps to the target code. * * The target code can choose to: * * - restore the value of r10 * - load the data address in a register * - restore the stack pointer to what it was when the trampoline was invoked. */ The target code in this case is the alt entry point. This is what the alt code is doing: - load the data (closure) address in r10 - discard the original value of r10 saved on the stack since we are using r10 we don't need its original value - restore the stack back to what it was when the static trampoline was invoked. So, the above prolog sets it up the way ffi_closure_unix64_sse expects to find it. I could add a small comment saying "see comment above trampoline_code_table". Is that acceptable? >> + jmp C(ffi_closure_unix64_sse) >> +ENDF(C(ffi_closure_unix64_sse_alt)) >> + >> .balign 2 >> .globl C(ffi_closure_unix64) >> FFI_HIDDEN(C(ffi_closure_unix64)) >> @@ -400,6 +412,17 @@ L(la): call PLT(C(abort)) >> L(UW11): >> ENDF(C(ffi_closure_unix64)) >> >> + .balign 8 >> + .globl C(ffi_closure_unix64_alt) >> + FFI_HIDDEN(C(ffi_closure_unix64_alt)) >> + >> +C(ffi_closure_unix64_alt): >> + _CET_ENDBR >> + movq 8(%rsp), %r10 >> + addq $16, %rsp >> + jmp C(ffi_closure_unix64) >> + ENDF(C(ffi_closure_unix64_alt)) >> + >> .balign 2 >> .globl C(ffi_go_closure_unix64_sse) >> FFI_HIDDEN(C(ffi_go_closure_unix64_sse)) > > Likewise. > See above. >> +/* >> + * The trampoline uses register r10. It saves the original value of r10 on >> + * the stack. >> + * >> + * The trampoline has two parameters - target code to jump to and data for >> + * the target code. The trampoline extracts the parameters from its parameter >> + * block (see tramp_table_map()). The trampoline saves the data address on >> + * the stack. Finally, it jumps to the target code. >> + * >> + * The target code can choose to: >> + * >> + * - restore the value of r10 >> + * - load the data address in a register >> + * - restore the stack pointer to what it was when the trampoline was invoked. >> + */ >> + >> + .align UNIX64_TRAMP_MAP_SIZE >> + .globl trampoline_code_table_cet >> + FFI_HIDDEN(C(trampoline_code_table_cet)) >> + >> +C(trampoline_code_table_cet): >> + .rept UNIX64_TRAMP_MAP_SIZE / UNIX64_TRAMP_SIZE_CET >> + _CET_ENDBR >> + subq $16, %rsp /* Make space on the stack */ >> + movq %r10, (%rsp) /* Save %r10 on stack */ >> + movq 4077(%rip), %r10 /* Copy data into %r10 */ >> + movq %r10, 8(%rsp) /* Save data on stack */ >> + movq 4073(%rip), %r10 /* Copy code into %r10 */ >> + jmp *%r10 /* Jump to code */ >> + nop >> + nop >> + nop >> + nop >> + nop >> + nop >> + .endr >> +ENDF(C(trampoline_code_table_cet)) >> + >> + .align UNIX64_TRAMP_MAP_SIZE >> + .globl trampoline_code_table >> + FFI_HIDDEN(C(trampoline_code_table)) >> + >> +C(trampoline_code_table): >> + .rept UNIX64_TRAMP_MAP_SIZE / UNIX64_TRAMP_SIZE >> + subq $16, %rsp /* Make space on the stack */ >> + movq %r10, (%rsp) /* Save %r10 on stack */ >> + movq 4081(%rip), %r10 /* Copy data into %r10 */ >> + movq %r10, 8(%rsp) /* Save data on stack */ >> + movq 4077(%rip), %r10 /* Copy code into %r10 */ >> + jmp *%r10 /* Jump to code */ >> + nop >> + nop >> + .endr >> +ENDF(C(trampoline_code_table)) >> + .align UNIX64_TRAMP_MAP_SIZE >> +#endif /* FFI_EXEC_STATIC_TRAMP */ > > Why does the longer trampoline (with endbr) have *more* nops? Is it for > 8-byte alignment? If so, comment ;-) > Yes. It is for 8-byte alignment. I will add the comment. >> @@ -615,6 +712,13 @@ L(EFDE5): >> .quad 0 >> #endif >> >> + .section .rodata >> + .align 8 >> + .globl ffi_cet_present >> +ffi_cet_present: >> + .set L6,L(endbr)-L(UW0) >> + .int L6 >> + > > Again, there are preprocessor directives that do this better. > See my explanation above and comment on whether the alternative is acceptable to you. >> diff --git a/src/x86/win64.S b/src/x86/win64.S >> index 8315e8b..6ca3068 100644 >> --- a/src/x86/win64.S >> +++ b/src/x86/win64.S >> @@ -234,6 +234,18 @@ C(ffi_closure_win64): >> >> cfi_endproc >> SEH(.seh_endproc) >> + >> + .align 8 >> + .globl C(ffi_closure_win64_alt) >> + FFI_HIDDEN(C(ffi_closure_win64_alt)) >> + >> + SEH(.seh_proc ffi_closure_win64_alt) >> +C(ffi_closure_win64_alt): >> + _CET_ENDBR >> + movq 8(%rsp), %r10 >> + addq $16, %rsp >> + jmp C(ffi_closure_win64) >> + SEH(.seh_endproc) >> #endif /* __x86_64__ */ > > Ok. > Again, thanks so much for the thorough review! Appreciate it! If I have missed anything, please let me know. Madhavan