Currently posix_spawn has to issue at least NSIG sigaction syscalls to obtain the current signal dispositions, and more if any of them is neither SIG_IGN nor SIG_DFL. The clone3 CLONE_CLEAR_SIGHAND flag resets all signal handlers of the child not set to SIG_IGN to SIG_DFL, thus allowing the preparation phase to be skipped. The exception is the signals defined by posix_spawnattr_setsigdefault when POSIX_SPAWN_SETSIGDEF is set, since they can be SIG_IGN. The patchset also adds clone3 implementations for aarch64, powerpc64, s390x, and riscv. Adhemerval Zanella (7): linux: Do not set signal handler if it is already SIG_DFL linux: Add clone3 CLONE_CLEAR_SIGHAND optimization to posix_spawn powerpc64le: Add the clone3 wrapper aarch64: Add the clone3 wrapper s390x: Add the clone3 wrapper riscv: Add the clone3 wrapper Linux: optimize clone3 internal usage include/clone_internal.h | 10 + posix/Makefile | 3 +- posix/tst-spawn7.c | 177 ++++++++++++++++++ sysdeps/unix/sysv/linux/aarch64/clone3.S | 90 +++++++++ sysdeps/unix/sysv/linux/aarch64/sysdep.h | 2 + sysdeps/unix/sysv/linux/clone-internal.c | 61 ++++-- sysdeps/unix/sysv/linux/clone3.h | 6 + sysdeps/unix/sysv/linux/kernel-features.h | 8 + .../sysv/linux/powerpc/powerpc64/clone3.S | 145 ++++++++++++++ sysdeps/unix/sysv/linux/powerpc/sysdep.h | 1 + sysdeps/unix/sysv/linux/riscv/clone3.S | 83 ++++++++ sysdeps/unix/sysv/linux/riscv/sysdep.h | 1 + sysdeps/unix/sysv/linux/s390/s390-64/clone3.S | 84 +++++++++ sysdeps/unix/sysv/linux/s390/sysdep.h | 1 + sysdeps/unix/sysv/linux/spawni.c | 33 +++- 15 files changed, 679 insertions(+), 26 deletions(-) create mode 100644 posix/tst-spawn7.c create mode 100644 sysdeps/unix/sysv/linux/aarch64/clone3.S create mode 100644 sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S create mode 100644 sysdeps/unix/sysv/linux/riscv/clone3.S create mode 100644 sysdeps/unix/sysv/linux/s390/s390-64/clone3.S -- 2.34.1
There is no need to issue another sigaction if the disposition is already SIG_DFL. Checked on x86_64-linux-gnu. --- sysdeps/unix/sysv/linux/spawni.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c index ee843a2247..65ee03c804 100644 --- a/sysdeps/unix/sysv/linux/spawni.c +++ b/sysdeps/unix/sysv/linux/spawni.c @@ -129,7 +129,7 @@ __spawni_child (void *arguments) else { __libc_sigaction (sig, 0, &sa); - if (sa.sa_handler == SIG_IGN) + if (sa.sa_handler == SIG_IGN || sa.sa_handler == SIG_DFL) continue; sa.sa_handler = SIG_DFL; } -- 2.34.1
The clone3 CLONE_CLEAR_SIGHAND flag resets all signal handlers of the child not set to SIG_IGN to SIG_DFL. This allows skipping most of the sigaction calls used to set up child signal handling, where previously posix_spawn had to issue 2 times NSIG sigaction calls (one to obtain the current disposition and another to set either SIG_DFL or SIG_IGN). The exception is POSIX_SPAWN_SETSIGDEF: the child still has to set up those signals, since their disposition may be SIG_IGN. It also needs to handle the case where clone3 is not available, so that the child falls back to iterating over all signals. This is done by splitting __clone_internal_fallback off from __clone_internal. Checked on x86_64-linux-gnu. --- include/clone_internal.h | 5 + posix/Makefile | 3 +- posix/tst-spawn7.c | 177 +++++++++++++++++++++++ sysdeps/unix/sysv/linux/clone-internal.c | 38 +++-- sysdeps/unix/sysv/linux/clone3.h | 6 + sysdeps/unix/sysv/linux/spawni.c | 31 ++-- 6 files changed, 235 insertions(+), 25 deletions(-) create mode 100644 posix/tst-spawn7.c diff --git a/include/clone_internal.h b/include/clone_internal.h index 4b23ef33ce..320640e64b 100644 --- a/include/clone_internal.h +++ b/include/clone_internal.h @@ -7,6 +7,11 @@ extern __typeof (clone3) __clone3; -1 with ENOSYS, fall back to clone or clone2. */ extern int __clone_internal (struct clone_args *__cl_args, int (*__func) (void *__arg), void *__arg); +/* The fallback code which calls clone/clone2 based on clone3 arguments. 
*/ +extern int __clone_internal_fallback (struct clone_args *__cl_args, + int (*__func) (void *__arg), + void *__arg) + attribute_hidden; #ifndef _ISOMAC libc_hidden_proto (__clone3) diff --git a/posix/Makefile b/posix/Makefile index d1df7c27cb..7887eb1c8b 100644 --- a/posix/Makefile +++ b/posix/Makefile @@ -109,7 +109,7 @@ tests := test-errno tstgetopt testfnm runtests runptests \ tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \ bug-regex38 tst-regcomp-truncated tst-spawn-chdir \ tst-wordexp-nocmd tst-execveat tst-spawn5 \ - tst-sched_getaffinity tst-spawn6 + tst-sched_getaffinity tst-spawn6 tst-spawn7 # Test for the glob symbol version that was replaced in glibc 2.27. ifeq ($(have-GLIBC_2.26)$(build-shared),yesyes) @@ -291,6 +291,7 @@ tst-spawn-ARGS = -- $(host-test-program-cmd) tst-spawn-static-ARGS = $(tst-spawn-ARGS) tst-spawn5-ARGS = -- $(host-test-program-cmd) tst-spawn6-ARGS = -- $(host-test-program-cmd) +tst-spawn7-ARGS = -- $(host-test-program-cmd) tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir tst-chmod-ARGS = $(objdir) tst-vfork3-ARGS = --test-dir=$(objpfx) diff --git a/posix/tst-spawn7.c b/posix/tst-spawn7.c new file mode 100644 index 0000000000..4b920bf08e --- /dev/null +++ b/posix/tst-spawn7.c @@ -0,0 +1,177 @@ +/* Tests for posix_spawn signal handling. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <http://www.gnu.org/licenses/>. */ + +#include <assert.h> +#include <getopt.h> +#include <spawn.h> +#include <stdlib.h> +#include <string.h> +#include <support/check.h> +#include <support/xsignal.h> +#include <support/xunistd.h> +#include <unistd.h> + +/* Nonzero if the program gets called via `exec'. */ +#define CMDLINE_OPTIONS \ + { "restart", no_argument, &restart, 1 }, +static int restart; + +/* Hold the four initial argument used to respawn the process, plus + the extra '--direct' and '--restart', the check type, and a final NULL. */ +static char *spargs[8]; +static int check_type_argc; + +/* Called on process re-execution. */ +_Noreturn static void +handle_restart (int argc, char *argv[]) +{ + assert (argc == 1); + + if (strcmp (argv[0], "SIG_DFL") == 0) + { + for (int i = 1; i < NSIG; i++) + { + struct sigaction sa; + int r = sigaction (i, NULL, &sa); + /* Skip internal signals (such as SIGCANCEL). */ + if (r == -1) + continue; + TEST_VERIFY_EXIT (sa.sa_handler == SIG_DFL); + } + } + else if (strcmp (argv[0], "SIG_IGN") == 0) + { + for (int i = 1; i < NSIG; i++) + { + struct sigaction sa; + int r = sigaction (i, NULL, &sa); + /* Skip internal signals (such as SIGCANCEL). 
*/ + if (r == -1) + continue; + if (i == SIGUSR1 || i == SIGUSR2) + TEST_VERIFY_EXIT (sa.sa_handler == SIG_IGN); + else + TEST_VERIFY_EXIT (sa.sa_handler == SIG_DFL); + } + } + + exit (EXIT_SUCCESS); +} + +static void +spawn_signal_test (const char *type, const posix_spawnattr_t *attr) +{ + spargs[check_type_argc] = (char*) type; + + pid_t pid; + int status; + + TEST_COMPARE (posix_spawn (&pid, spargs[0], NULL, attr, spargs, environ), 0); + TEST_COMPARE (xwaitpid (pid, &status, 0), pid); + TEST_VERIFY (WIFEXITED (status)); + TEST_VERIFY (!WIFSIGNALED (status)); + TEST_COMPARE (WEXITSTATUS (status), 0); +} + +static void +dummy_sa_handler (int) +{ +} + +static void +do_test_signals (void) +{ + { + /* Check if all signals handler are set to SIG_DFL on spawned process. */ + spawn_signal_test ("SIG_DFL", NULL); + } + + { + /* Same as before, but set SIGUSR1 and SIGUSR2. */ + struct sigaction sa = { 0 }; + sa.sa_handler = dummy_sa_handler; + xsigaction (SIGUSR1, &sa, NULL); + xsigaction (SIGUSR2, &sa, NULL); + spawn_signal_test ("SIG_DFL", NULL); + } + + { + /* Check if SIG_IGN is keep as is. */ + struct sigaction sa = { 0 }; + sa.sa_handler = SIG_IGN; + xsigaction (SIGUSR1, &sa, NULL); + xsigaction (SIGUSR2, &sa, NULL); + spawn_signal_test ("SIG_IGN", NULL); + } + + { + /* Check if SIG_IGN handlers are set to SIG_DFL. 
*/ + posix_spawnattr_t attr; + posix_spawnattr_init (&attr); + sigset_t mask; + sigemptyset (&mask); + sigaddset (&mask, SIGUSR1); + sigaddset (&mask, SIGUSR2); + posix_spawnattr_setsigdefault (&attr, &mask); + posix_spawnattr_setflags (&attr, POSIX_SPAWN_SETSIGDEF); + spawn_signal_test ("SIG_DFL", &attr); + posix_spawnattr_destroy (&attr); + } +} + +static int +do_test (int argc, char *argv[]) +{ + /* We must have either: + + - one or four parameters if called initially: + + argv[1]: path for ld.so optional + + argv[2]: "--library-path" optional + + argv[3]: the library path optional + + argv[4]: the application name + + - six parameters left if called through re-execution: + + argv[1]: the application name + + argv[2]: check SIG_IGN + + * When built with --enable-hardcoded-path-in-tests or issued without + using the loader directly. */ + + if (restart) + /* Ignore the application name. */ + handle_restart (argc - 1, &argv[1]); + + TEST_VERIFY_EXIT (argc == 2 || argc == 5); + + int i; + + for (i = 0; i < argc - 1; i++) + spargs[i] = argv[i + 1]; + spargs[i++] = (char *) "--direct"; + spargs[i++] = (char *) "--restart"; + check_type_argc = i++; + spargs[i] = NULL; + + + do_test_signals (); + + return 0; +} + +#define TEST_FUNCTION_ARGV do_test +#include <support/test-driver.c> diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c index a71effcbd3..d9eb254ffb 100644 --- a/sysdeps/unix/sysv/linux/clone-internal.c +++ b/sysdeps/unix/sysv/linux/clone-internal.c @@ -44,27 +44,15 @@ _Static_assert (sizeof (struct clone_args) == CLONE_ARGS_SIZE_VER2, "sizeof (struct clone_args) != CLONE_ARGS_SIZE_VER2"); int -__clone_internal (struct clone_args *cl_args, - int (*func) (void *arg), void *arg) +__clone_internal_fallback (struct clone_args *cl_args, + int (*func) (void *arg), void *arg) { - int ret; -#ifdef HAVE_CLONE3_WRAPPER - /* Try clone3 first. 
*/ - int saved_errno = errno; - ret = __clone3 (cl_args, sizeof (*cl_args), func, arg); - if (ret != -1 || errno != ENOSYS) - return ret; - - /* NB: Restore errno since errno may be checked against non-zero - return value. */ - __set_errno (saved_errno); -#endif - /* Map clone3 arguments to clone arguments. NB: No need to check invalid clone3 specific bits in flags nor exit_signal since this is an internal function. */ int flags = cl_args->flags | cl_args->exit_signal; void *stack = cast_to_pointer (cl_args->stack); + int ret; #ifdef __ia64__ ret = __clone2 (func, stack, cl_args->stack_size, @@ -88,4 +76,24 @@ __clone_internal (struct clone_args *cl_args, return ret; } + +int +__clone_internal (struct clone_args *cl_args, + int (*func) (void *arg), void *arg) +{ +#ifdef HAVE_CLONE3_WRAPPER + /* Try clone3 first. */ + int saved_errno = errno; + int ret = __clone3 (cl_args, sizeof (*cl_args), func, arg); + if (ret != -1 || errno != ENOSYS) + return ret; + + /* NB: Restore errno since errno may be checked against non-zero + return value. */ + __set_errno (saved_errno); +#endif + + return __clone_internal_fallback (cl_args, func, arg); +} + libc_hidden_def (__clone_internal) diff --git a/sysdeps/unix/sysv/linux/clone3.h b/sysdeps/unix/sysv/linux/clone3.h index 889014a6a9..a150363f89 100644 --- a/sysdeps/unix/sysv/linux/clone3.h +++ b/sysdeps/unix/sysv/linux/clone3.h @@ -25,6 +25,12 @@ __BEGIN_DECLS +/* Flags for the clone3 syscall. */ +#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and + reset to SIG_DFL. */ +#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given + the right permissions. */ + /* The unsigned 64-bit and 8-byte aligned integer type. 
*/ typedef __U64_TYPE __aligned_uint64_t __attribute__ ((__aligned__ (8))); diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c index 65ee03c804..5251b4c602 100644 --- a/sysdeps/unix/sysv/linux/spawni.c +++ b/sysdeps/unix/sysv/linux/spawni.c @@ -66,6 +66,7 @@ struct posix_spawn_args ptrdiff_t argc; char *const *envp; int xflags; + bool use_clone3; int err; }; @@ -104,12 +105,11 @@ __spawni_child (void *arguments) const posix_spawnattr_t *restrict attr = args->attr; const posix_spawn_file_actions_t *file_actions = args->fa; - /* The child must ensure that no signal handler are enabled because it shared - memory with parent, so the signal disposition must be either SIG_DFL or - SIG_IGN. It does by iterating over all signals and although it could - possibly be more optimized (by tracking which signal potentially have a - signal handler), it might requires system specific solutions (since the - sigset_t data type can be very different on different architectures). */ + /* The child must ensure that no signal handler are enabled because it + shared memory with parent, so the signal disposition must be either + SIG_DFL or SIG_IGN. If clone3/CLONE_CLEAR_SIGHAND is used, there is + only need to set the defined signals to SIG_DFL if POSIX_SPAWN_SETSIGDEF + is used; otherwise, the code iterates over all signals. */ struct sigaction sa; memset (&sa, '\0', sizeof (sa)); @@ -122,7 +122,7 @@ __spawni_child (void *arguments) { sa.sa_handler = SIG_DFL; } - else if (__sigismember (&hset, sig)) + else if (!args->use_clone3 && __sigismember (&hset, sig)) { if (is_internal_signal (sig)) sa.sa_handler = SIG_IGN; @@ -382,12 +382,25 @@ __spawnix (pid_t * pid, const char *file, for instance). */ struct clone_args clone_args = { - .flags = CLONE_VM | CLONE_VFORK, + /* No supported flags like CLONE_CLEAR_SIGHAND will be cleared up by + __clone_internal_fallback. 
*/ + .flags = CLONE_CLEAR_SIGHAND | CLONE_VM | CLONE_VFORK, .exit_signal = SIGCHLD, .stack = (uintptr_t) stack, .stack_size = stack_size, }; - new_pid = __clone_internal (&clone_args, __spawni_child, &args); +#ifdef HAVE_CLONE3_WRAPPER + args.use_clone3 = true; + new_pid = __clone3 (&clone_args, sizeof (clone_args), __spawni_child, + &args); + /* clone3 was added on 5.3 and CLONE_CLEAR_SIGHAND on 5.5. */ + if (new_pid == -1 && (errno == ENOSYS || errno == EINVAL)) +#endif + { + args.use_clone3 = false; + new_pid = __clone_internal_fallback (&clone_args, __spawni_child, + &args); + } /* It needs to collect the case where the auxiliary process was created but failed to execute the file (due either any preparation step or -- 2.34.1
It follows the internal signature: extern int clone3 (struct clone_args *__cl_args, size_t __size, int (*__func) (void *__arg), void *__arg); And x86_64 semantics to return EINVAL if either cl_args or func is NULL. The stack is 16-byte aligned prior executing func. Checked on powerpc64le-linux-gnu. --- .../sysv/linux/powerpc/powerpc64/clone3.S | 145 ++++++++++++++++++ sysdeps/unix/sysv/linux/powerpc/sysdep.h | 1 + 2 files changed, 146 insertions(+) create mode 100644 sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S new file mode 100644 index 0000000000..a7461a3a42 --- /dev/null +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/clone3.S @@ -0,0 +1,145 @@ +/* The clone3 syscall wrapper. Linux/powerpc64 version. + Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +/* The userland implementation is: + int clone3 (struct clone_args *cl_args, size_t size, + int (*func)(void *arg), void *arg); + + the kernel entry is: + int clone3 (struct clone_args *cl_args, size_t size); + + The parameters are passed in registers from userland: + r3: cl_args + r4: size + r5: func + r6: arg */ + + .text +ENTRY(__clone3) + CALL_MCOUNT 4 + + /* Sanity checks args. */ + cmpdi cr0, r3, 0 + cmpdi cr1, r5, 0 + cror cr0*4+eq, cr1*4+eq, cr0*4+eq + beq cr0,L(badargs) + + /* Save some regs in the "red zone". */ +#ifdef USE_PPC_SCV + std r28, -24(r1) + cfi_offset (r28, -24) +#endif + std r29, -16(r1) + std r30, -8(r1) + cfi_offset (r29, -16) + cfi_offset (r30, -8) + + /* Save fn and args across syscall. */ + mr r30, r5 /* Function. */ + mr r29, r6 /* Arguments. */ + + /* End FDE now, because in the child the unwind info will be + wrong. */ + cfi_endproc + + /* Do the system call, the kernel expects: + r0: system call numer + r3: cl_args + r4: size */ + li r0, SYS_ify(clone3) +#ifdef USE_PPC_SCV + CHECK_SCV_SUPPORT r28 0f + /* This is equivalent to DO_CALL_SCV, but we cannot use the macro here + because it uses CFI directives and we just called cfi_endproc. */ + mflr r9 + std r9, FRAME_LR_SAVE(r1) + .machine "push" + .machine "power9" + scv 0 + .machine "pop" + ld r9, FRAME_LR_SAVE(r1) + mtlr r9 + + /* When using scv, error is indicated by negative r3. */ + cmpdi cr1, r3, 0 + b 1f +#endif +0: DO_CALL_SC + + /* With sc, error is indicated by cr0.SO. */ + cmpdi cr1, r3, 0 + crandc cr1*4+eq, cr1*4+eq, cr0*4+so + +1: bne- cr1,L(parent) + + /* Child, load the function and arguments. */ + + /* Align stack. */ + rldicr r1, r1, 0, 59 + + std r2, FRAME_TOC_SAVE(r1) + PPC64_LOAD_FUNCPTR r30 + mr r3, r29 + bctrl + ld r2, FRAME_TOC_SAVE(r1) + + li r0, SYS_ify(exit) + DO_CALL_SC + /* We won't ever get here but provide a nop so that the linker + will insert a toc adjusting stub if necessary. 
*/ + nop + +L(badargs): + cfi_startproc + li r3, EINVAL + TAIL_CALL_SYSCALL_ERROR + +L(parent): + /* Check if svc is available. */ + cmpdi cr1, r28, 0 + + /* Parent. Restore registers & return. */ +#ifdef USE_PPC_SCV + cfi_offset (r28, -24) + ld r28, -24(r1) + cfi_restore (r28) +#endif + cfi_offset (r29,-16) + cfi_offset (r30,-8) + ld r29, -16(r1) + ld r30, -8(r1) + cfi_restore (r29) + cfi_restore (r30) + +#ifdef USE_PPC_SCV + beq cr1, 0f + RET_SCV + b 1f +#endif +0: RET_SC +1: TAIL_CALL_SYSCALL_ERROR + +PSEUDO_END (__clone3) + +libc_hidden_def (__clone3) +weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/powerpc/sysdep.h b/sysdeps/unix/sysv/linux/powerpc/sysdep.h index 4fb135aa8d..b86951207d 100644 --- a/sysdeps/unix/sysv/linux/powerpc/sysdep.h +++ b/sysdeps/unix/sysv/linux/powerpc/sysdep.h @@ -246,6 +246,7 @@ #if defined(__PPC64__) || defined(__powerpc64__) #define HAVE_CLOCK_GETRES64_VSYSCALL "__kernel_clock_getres" #define HAVE_CLOCK_GETTIME64_VSYSCALL "__kernel_clock_gettime" +#define HAVE_CLONE3_WRAPPER 1 #else #define HAVE_CLOCK_GETRES_VSYSCALL "__kernel_clock_getres" #define HAVE_CLOCK_GETTIME_VSYSCALL "__kernel_clock_gettime" -- 2.34.1
It follows the internal signature: extern int clone3 (struct clone_args *__cl_args, size_t __size, int (*__func) (void *__arg), void *__arg); And x86_64 semantics to return EINVAL if either cl_args or func is NULL. The stack is 16-byte aligned prior executing func. Checked on aarch64-linux-gnu. --- sysdeps/unix/sysv/linux/aarch64/clone3.S | 90 ++++++++++++++++++++++++ sysdeps/unix/sysv/linux/aarch64/sysdep.h | 2 + 2 files changed, 92 insertions(+) create mode 100644 sysdeps/unix/sysv/linux/aarch64/clone3.S diff --git a/sysdeps/unix/sysv/linux/aarch64/clone3.S b/sysdeps/unix/sysv/linux/aarch64/clone3.S new file mode 100644 index 0000000000..dba93430eb --- /dev/null +++ b/sysdeps/unix/sysv/linux/aarch64/clone3.S @@ -0,0 +1,90 @@ +/* The clone3 syscall wrapper. Linux/aarch64 version. + Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +/* The userland implementation is: + int clone3 (struct clone_args *cl_args, size_t size, + int (*func)(void *arg), void *arg); + + the kernel entry is: + int clone3 (struct clone_args *cl_args, size_t size); + + The parameters are passed in registers from userland: + x0: cl_args + x1: size + x2: func + x3: arg */ + + .text +ENTRY(__clone3) + PTR_ARG (0) + PTR_ARG (1) + PTR_ARG (3) + PTR_ARG (4) + /* Save args for the child. */ + mov x10, x0 /* cl_args */ + mov x11, x2 /* func */ + mov x12, x3 /* args */ + + /* Sanity check args. */ + mov x0, #-EINVAL + cbz x10, .Lsyscall_error /* No NULL cl_args pointer. */ + cbz x2, .Lsyscall_error /* No NULL function pointer. */ + + /* Do the system call, the kernel expects: + x8: system call number + x0: cl_args + x1: size */ + mov x0, x10 + mov x8, #SYS_ify(clone3) + svc 0x0 + + cmp x0, #0 + beq thread_start + blt .Lsyscall_error + RET +PSEUDO_END (__clone3) + + .align 4 + .type thread_start, %function +thread_start: + cfi_startproc + cfi_undefined (x30) + mov x29, 0 + + /* Align sp. */ + mov x0, sp + and x0, x0, -16 + mov sp, x0 + + /* Pick the function arg and execute. */ + mov x0, x12 + blr x11 + + /* We are done, pass the return value through x0. */ + mov x8, #SYS_ify(exit) + svc 0x0 + cfi_endproc + .size thread_start, .-thread_start + +libc_hidden_def (__clone3) +weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/aarch64/sysdep.h b/sysdeps/unix/sysv/linux/aarch64/sysdep.h index f1853e012f..42bb22f5e6 100644 --- a/sysdeps/unix/sysv/linux/aarch64/sysdep.h +++ b/sysdeps/unix/sysv/linux/aarch64/sysdep.h @@ -164,6 +164,8 @@ # define HAVE_CLOCK_GETTIME64_VSYSCALL "__kernel_clock_gettime" # define HAVE_GETTIMEOFDAY_VSYSCALL "__kernel_gettimeofday" +# define HAVE_CLONE3_WRAPPER 1 + # undef INTERNAL_SYSCALL_RAW # define INTERNAL_SYSCALL_RAW(name, nr, args...) \ ({ long _sys_result; \ -- 2.34.1
It follows the internal signature: extern int clone3 (struct clone_args *__cl_args, size_t __size, int (*__func) (void *__arg), void *__arg); And x86_64 semantics to return EINVAL if either cl_args or func is NULL. The stack is 16-byte aligned prior executing func. Checked on s390x-linux-gnu. --- sysdeps/unix/sysv/linux/s390/s390-64/clone3.S | 84 +++++++++++++++++++ sysdeps/unix/sysv/linux/s390/sysdep.h | 1 + 2 files changed, 85 insertions(+) create mode 100644 sysdeps/unix/sysv/linux/s390/s390-64/clone3.S diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S b/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S new file mode 100644 index 0000000000..d338acfc9c --- /dev/null +++ b/sysdeps/unix/sysv/linux/s390/s390-64/clone3.S @@ -0,0 +1,84 @@ +/* The clone3 syscall wrapper. Linux/s390x version. + Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <sysdep.h> +#define _ERRNO_H 1 +#include <bits/errno.h> + +/* The userland implementation is: + int clone3 (struct clone_args *cl_args, size_t size, + int (*func)(void *arg), void *arg); + + the kernel entry is: + int clone3 (struct clone_args *cl_args, size_t size); + + The parameters are passed in registers from userland: + r2: cl_args + r3: size + r4: func + r5: arg */ + + .text +ENTRY(__clone3) + stg %r6, 48(%r15) + + /* Sanity check args. */ + ltgr %r2, %r2 + je error + ltgr %r6, %r4 + je error + + /* Do the system call, the kernel expects: + r1: system call number + r2: cl_args + r3: size */ + lghi %r1, SYS_ify(clone3) + svc 0 + ltgr %r2,%r2 /* check return code */ + jz thread_start + lg %r6, 48(%r15) + jgm SYSCALL_ERROR_LABEL + br %r14 +error: + lghi %r2,-EINVAL + jg SYSCALL_ERROR_LABEL +PSEUDO_END (__clone3) + + .align 4 + .type thread_start, %function +thread_start: + cfi_startproc + /* Mark r14 as undefined in order to stop unwinding here. */ + cfi_undefined (r14) + + /* Align stack. */ + nill %r15, 0xfff0 + + /* func is in gpr 6, arg in gpr 5. */ + lgr %r2, %r5 + aghi %r15, -160 + xc 0(8,%r15),0(%r15) + basr %r14, %r6 + + DO_CALL (exit, 1) + cfi_endproc + .size thread_start, .-thread_start + +libc_hidden_def (__clone3) +weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/s390/sysdep.h b/sysdeps/unix/sysv/linux/s390/sysdep.h index 930d7efe03..366c797ef1 100644 --- a/sysdeps/unix/sysv/linux/s390/sysdep.h +++ b/sysdeps/unix/sysv/linux/s390/sysdep.h @@ -72,6 +72,7 @@ #ifdef __s390x__ #define HAVE_CLOCK_GETRES64_VSYSCALL "__kernel_clock_getres" #define HAVE_CLOCK_GETTIME64_VSYSCALL "__kernel_clock_gettime" +#define HAVE_CLONE3_WRAPPER 1 #else #define HAVE_CLOCK_GETRES_VSYSCALL "__kernel_clock_getres" #define HAVE_CLOCK_GETTIME_VSYSCALL "__kernel_clock_gettime" -- 2.34.1
It follows the internal signature: extern int clone3 (struct clone_args *__cl_args, size_t __size, int (*__func) (void *__arg), void *__arg); And x86_64 semantics to return EINVAL if either cl_args or func is NULL. The stack is 16-byte aligned prior executing func. Checked on riscv64-linux-gnu-rv64imafdc-lp64d. --- sysdeps/unix/sysv/linux/riscv/clone3.S | 83 ++++++++++++++++++++++++++ sysdeps/unix/sysv/linux/riscv/sysdep.h | 1 + 2 files changed, 84 insertions(+) create mode 100644 sysdeps/unix/sysv/linux/riscv/clone3.S diff --git a/sysdeps/unix/sysv/linux/riscv/clone3.S b/sysdeps/unix/sysv/linux/riscv/clone3.S new file mode 100644 index 0000000000..1e6e807aa0 --- /dev/null +++ b/sysdeps/unix/sysv/linux/riscv/clone3.S @@ -0,0 +1,83 @@ +/* The clone3 syscall wrapper. Linux/RISC-V version. + Copyright (C) 2022 Free Software Foundation, Inc. + + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <asm/errno.h> +#include <sys/asm.h> +#include <sysdep.h> + +/* The userland implementation is: + int clone3 (struct clone_args *cl_args, size_t size, + int (*func)(void *arg), void *arg); + + the kernel entry is: + int clone3 (struct clone_args *cl_args, size_t size); + + The parameters are passed in registers from userland: + a0: cl_args + a1: size + a2: func + a3: arg */ + + .text +ENTRY(__clone3) + /* Sanity check args. */ + beqz a0, L(invalid) /* No NULL cl_args pointer. */ + beqz a2, L(invalid) /* No NULL function pointer. */ + + /* Do the system call, the kernel expects: + a7: system call number + a0: cl_args + a1: size */ + li a7, __NR_clone3 + scall + + bltz a0, L(error) + beqz a0, L(thread_start) + + ret + +L(invalid): + li a0, -EINVAL +L(error): + tail __syscall_error +END (__clone3) + +ENTRY(__thread_start_clone3) +L(thread_start): + /* Terminate call stack by noting ra is undefined. Use a dummy + .cfi_label to force starting the FDE. */ + .cfi_label .Ldummy + cfi_undefined (ra) + + /* Align stack to a 128-bit boundary as per RISC-V ABI. */ + andi sp, sp, ALMASK + + /* Restore the arg for user's function and call the user's + function. */ + mv a1, a2 /* Function pointer. */ + mv a0, a3 /* Argument pointer. */ + jalr a1 + + /* Call exit with the function's return value. */ + li a7, __NR_exit + scall +END(__thread_start_clone3) + +libc_hidden_def (__clone3) +weak_alias (__clone3, clone3) diff --git a/sysdeps/unix/sysv/linux/riscv/sysdep.h b/sysdeps/unix/sysv/linux/riscv/sysdep.h index 9b03b10567..57523dc7b7 100644 --- a/sysdeps/unix/sysv/linux/riscv/sysdep.h +++ b/sysdeps/unix/sysv/linux/riscv/sysdep.h @@ -150,6 +150,7 @@ /* RV32 does not support the gettime VDSO syscalls. */ # endif +# define HAVE_CLONE3_WRAPPER 1 /* List of system calls which are supported as vsyscalls (for RV32 and RV64). */ -- 2.34.1
Now that clone3 is used on more architectures, add an optimization to avoid calling it once glibc detects that it is not supported by the kernel. It also adds __ASSUME_CLONE3, which allows skipping this check and issuing the clone3 syscall directly. It does not handle the small window between 5.3 and 5.5 for posix_spawn (CLONE_CLEAR_SIGHAND was added in 5.5). Checked on x86_64-linux-gnu. --- include/clone_internal.h | 5 +++++ sysdeps/unix/sysv/linux/clone-internal.c | 25 ++++++++++++++++++++++- sysdeps/unix/sysv/linux/kernel-features.h | 8 ++++++++ 3 files changed, 37 insertions(+), 1 deletion(-) diff --git a/include/clone_internal.h b/include/clone_internal.h index 320640e64b..71c52211c7 100644 --- a/include/clone_internal.h +++ b/include/clone_internal.h @@ -7,6 +7,11 @@ extern __typeof (clone3) __clone3; -1 with ENOSYS, fall back to clone or clone2. */ extern int __clone_internal (struct clone_args *__cl_args, int (*__func) (void *__arg), void *__arg); +/* clone3 wrapper that avoid issuing the syscall once clone3 fails with + ENOSYS. */ +extern int __clone3_internal (struct clone_args *cl_args, + int (*func) (void *args), void *arg) + attribute_hidden; /* The fallback code which calls clone/clone2 based on clone3 arguments. 
*/ extern int __clone_internal_fallback (struct clone_args *__cl_args, int (*__func) (void *__arg), diff --git a/sysdeps/unix/sysv/linux/clone-internal.c b/sysdeps/unix/sysv/linux/clone-internal.c index d9eb254ffb..7e20d02025 100644 --- a/sysdeps/unix/sysv/linux/clone-internal.c +++ b/sysdeps/unix/sysv/linux/clone-internal.c @@ -76,6 +76,29 @@ __clone_internal_fallback (struct clone_args *cl_args, return ret; } +int +__clone3_internal (struct clone_args *cl_args, int (*func) (void *args), + void *arg) +{ +#ifdef HAVE_CLONE3_WRAPPER +# if __ASSUME_CLONE3 + return __clone3 (cl_args, sizeof (*cl_args), func, arg); +# else + static int clone3_supported = 1; + if (atomic_load_relaxed (&clone3_supported) == 1) + { + int ret = __clone3 (cl_args, sizeof (*cl_args), func, arg); + if (ret != -1 || errno != ENOSYS) + return ret; + + atomic_store_relaxed (&clone3_supported, 0); + } +# endif +#endif + __set_errno (ENOSYS); + return -1; +} + int __clone_internal (struct clone_args *cl_args, @@ -84,7 +107,7 @@ __clone_internal (struct clone_args *cl_args, #ifdef HAVE_CLONE3_WRAPPER /* Try clone3 first. */ int saved_errno = errno; - int ret = __clone3 (cl_args, sizeof (*cl_args), func, arg); + int ret = __clone3_internal (cl_args, func, arg); if (ret != -1 || errno != ENOSYS) return ret; diff --git a/sysdeps/unix/sysv/linux/kernel-features.h b/sysdeps/unix/sysv/linux/kernel-features.h index 74adc3956b..496c7f34cb 100644 --- a/sysdeps/unix/sysv/linux/kernel-features.h +++ b/sysdeps/unix/sysv/linux/kernel-features.h @@ -236,4 +236,12 @@ # define __ASSUME_FUTEX_LOCK_PI2 0 #endif +/* The clone3 system call was introduced across all architectures + in Linux 5.3. */ +#if __LINUX_KERNEL_VERSION >= 0x050300 +# define __ASSUME_CLONE3 1 +#else +# define __ASSUME_CLONE3 0 +#endif + #endif /* kernel-features.h */ -- 2.34.1