public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] linux: Fallback to fork and exec for clone fail on posix_spawn (BZ#29115)
@ 2022-05-02 18:06 Adhemerval Zanella
  2022-05-02 19:15 ` Alexey Izbyshev
  0 siblings, 1 reply; 13+ messages in thread
From: Adhemerval Zanella @ 2022-05-02 18:06 UTC (permalink / raw)
  To: libc-alpha, Alexey Izbyshev

Linux clone with CLONE_VM | CLONE_VFORK may fail for some namespace
restriction (for instance kernel does not allow processes in different
time namespaces to share the sameaddress space).

In this case clone fails with EINVAL and thus posix_spawn can not spawn
a new process.  However the same process can be spawned with fork and exec.

The patch fixes by issues _Fork and exec iff clone fails with a non
transient failure (ENOMEM and EAGAIN still returns failure to caller).

Failure on prepare phase in helper process does not trigger the fork
and exec fallback.

Checked on x86_64-linux-gnu.
---
 sysdeps/unix/sysv/linux/spawni.c | 175 ++++++++++++++++++++-----------
 1 file changed, 116 insertions(+), 59 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index d6f5ca89cd..b73715da7e 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -67,6 +67,7 @@ struct posix_spawn_args
   char *const *envp;
   int xflags;
   int err;
+  int pipe[2];
 };
 
 /* Older version requires that shell script without shebang definition
@@ -103,6 +104,7 @@ __spawni_child (void *arguments)
   struct posix_spawn_args *args = arguments;
   const posix_spawnattr_t *restrict attr = args->attr;
   const posix_spawn_file_actions_t *file_actions = args->fa;
+  bool use_pipe = args->pipe[0] != -1;
 
   /* The child must ensure that no signal handler are enabled because it shared
      memory with parent, so the signal disposition must be either SIG_DFL or
@@ -113,6 +115,9 @@ __spawni_child (void *arguments)
   struct sigaction sa;
   memset (&sa, '\0', sizeof (sa));
 
+  if (use_pipe)
+    __close (args->pipe[0]);
+
   sigset_t hset;
   __sigprocmask (SIG_BLOCK, 0, &hset);
   for (int sig = 1; sig < _NSIG; ++sig)
@@ -300,45 +305,26 @@ fail:
      (EINTERNALBUG) describing that, use ECHILD.  Another option would
      be to set args->err to some negative sentinel and have the parent
      abort(), but that seems needlessly harsh.  */
-  args->err = errno ? : ECHILD;
+  int ret = errno ? : ECHILD;
+  if (use_pipe)
+    {
+      __close (args->pipe[0]);
+      while (__write_nocancel (args->pipe[1], &ret, sizeof (ret)) < 0);
+    }
+  else
+    args->err = ret;
+
   _exit (SPAWN_ERROR);
 }
 
-/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
-   Before running the process perform the actions described in FILE-ACTIONS. */
-static int
-__spawnix (pid_t * pid, const char *file,
-	   const posix_spawn_file_actions_t * file_actions,
-	   const posix_spawnattr_t * attrp, char *const argv[],
-	   char *const envp[], int xflags,
-	   int (*exec) (const char *, char *const *, char *const *))
+/* Create a new process using clone with CLONE_VM | CLONE_VFORK to optimize
+   memory usage.  Return TRUE is the helper process could be spawned, or
+   FALSE otherwise.  */
+static bool
+spawni_clone (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
 {
-  pid_t new_pid;
-  struct posix_spawn_args args;
-  int ec;
-
-  /* To avoid imposing hard limits on posix_spawn{p} the total number of
-     arguments is first calculated to allocate a mmap to hold all possible
-     values.  */
-  ptrdiff_t argc = 0;
-  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
-     to be used in a execve call.  We limit to INT_MAX minus one due the
-     compatiblity code that may execute a shell script (maybe_script_execute)
-     where it will construct another argument list with an additional
-     argument.  */
-  ptrdiff_t limit = INT_MAX - 1;
-  while (argv[argc++] != NULL)
-    if (argc == limit)
-      {
-	errno = E2BIG;
-	return errno;
-      }
-
-  int prot = (PROT_READ | PROT_WRITE
-	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
-
   /* Add a slack area for child's stack.  */
-  size_t argv_size = (argc * sizeof (void *)) + 512;
+  size_t argv_size = (args->argc * sizeof (void *)) + 512;
   /* We need at least a few pages in case the compiler's stack checking is
      enabled.  In some configs, it is known to use at least 24KiB.  We use
      32KiB to be "safe" from anything the compiler might do.  Besides, the
@@ -347,28 +333,12 @@ __spawnix (pid_t * pid, const char *file,
      where it might use about 1k extra stack space).  */
   argv_size += (32 * 1024);
   size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
+  int prot = (PROT_READ | PROT_WRITE
+	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
   void *stack = __mmap (NULL, stack_size, prot,
 			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
   if (__glibc_unlikely (stack == MAP_FAILED))
-    return errno;
-
-  /* Disable asynchronous cancellation.  */
-  int state;
-  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
-
-  /* Child must set args.err to something non-negative - we rely on
-     the parent and child sharing VM.  */
-  args.err = 0;
-  args.file = file;
-  args.exec = exec;
-  args.fa = file_actions;
-  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
-  args.argv = argv;
-  args.argc = argc;
-  args.envp = envp;
-  args.xflags = xflags;
-
-  __libc_signal_block_all (&args.oldmask);
+    return true;
 
   /* The clone flags used will create a new child that will run in the same
      memory space (CLONE_VM) and the execution of calling thread will be
@@ -385,12 +355,12 @@ __spawnix (pid_t * pid, const char *file,
       .stack = (uintptr_t) stack,
       .stack_size = stack_size,
     };
-  new_pid = __clone_internal (&clone_args, __spawni_child, &args);
+  *new_pid = __clone_internal (&clone_args, __spawni_child, args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
      for execve itself).  */
-  if (new_pid > 0)
+  if (*new_pid > 0)
     {
       /* Also, it handles the unlikely case where the auxiliary process was
 	 terminated before calling execve as if it was successfully.  The
@@ -398,21 +368,108 @@ __spawnix (pid_t * pid, const char *file,
 	 only in case of failure, so in case of premature termination
 	 due a signal args.err will remain zeroed and it will be up to
 	 caller to actually collect it.  */
-      ec = args.err;
-      if (ec > 0)
+      *ec = args->err;
+      if (*ec > 0)
 	/* There still an unlikely case where the child is cancelled after
 	   setting args.err, due to a positive error value.  Also there is
 	   possible pid reuse race (where the kernel allocated the same pid
 	   to an unrelated process).  Unfortunately due synchronization
 	   issues where the kernel might not have the process collected
 	   the waitpid below can not use WNOHANG.  */
-	__waitpid (new_pid, NULL, 0);
+	__waitpid (*new_pid, NULL, 0);
     }
   else
-    ec = errno;
+    *ec = errno;
 
   __munmap (stack, stack_size);
 
+  /* There is no much point in retrying with fork and exec if kernel returns a
+     failure due resource exhaustion.  */
+  return *new_pid > 0 || (errno != ENOMEM && errno != EAGAIN);
+}
+
+/* Fallback spawn case which uses fork plus exec along with a pipe to
+   communicate any helper process prepare pahse or execve failure.  */
+static void
+spawni_fork_fallback (struct posix_spawn_args *args, pid_t *new_pid, int *ec)
+{
+  if (__pipe2 (args->pipe, O_CLOEXEC))
+    return;
+
+  /* Do not trigger atfork handler nor any internal state reset since the
+     helper process will call execve.  */
+  *new_pid = _Fork ();
+  if (*new_pid == 0)
+    __spawni_child (args);
+  else if (*new_pid > 0)
+    {
+      __close (args->pipe[1]);
+      if (__read (args->pipe[0], ec, sizeof *ec) != sizeof *ec)
+	*ec = 0;
+      else
+	__waitpid (*new_pid, NULL, 0);
+    }
+  else
+    *ec = errno;
+
+  __close (args->pipe[0]);
+}
+
+/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
+   Before running the process perform the actions described in FILE-ACTIONS. */
+static int
+__spawnix (pid_t * pid, const char *file,
+	   const posix_spawn_file_actions_t * file_actions,
+	   const posix_spawnattr_t * attrp, char *const argv[],
+	   char *const envp[], int xflags,
+	   int (*exec) (const char *, char *const *, char *const *))
+{
+  /* To avoid imposing hard limits on posix_spawn{p} the total number of
+     arguments is first calculated to allocate a mmap to hold all possible
+     values.  */
+  ptrdiff_t argc = 0;
+  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
+     to be used in a execve call.  We limit to INT_MAX minus one due the
+     compatiblity code that may execute a shell script (maybe_script_execute)
+     where it will construct another argument list with an additional
+     argument.  */
+  ptrdiff_t limit = INT_MAX - 1;
+  while (argv[argc++] != NULL)
+    if (argc == limit)
+      {
+	errno = E2BIG;
+	return errno;
+      }
+
+  /* Disable asynchronous cancellation.  */
+  int state;
+  __pthread_setcancelstate (PTHREAD_CANCEL_DISABLE, &state);
+
+  /* Child must set args.err to something non-negative - we rely on
+     the parent and child sharing VM.  */
+  struct posix_spawn_args args;
+  args.err = 0;
+  args.file = file;
+  args.exec = exec;
+  args.fa = file_actions;
+  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
+  args.argv = argv;
+  args.argc = argc;
+  args.envp = envp;
+  args.xflags = xflags;
+  args.pipe[0] = args.pipe[1] = -1;
+
+  __libc_signal_block_all (&args.oldmask);
+
+  pid_t new_pid = 0;
+  int ec = -1;
+  /* clone with CLONE_VM | CLONE_VFORK may fail for some namespace restriction
+     (for instance Linux does not allow processes in different time namespaces
+     to share address space) and in this case clone fails with EINVAL.  Retry
+     with fork and exec.  */
+  if (!spawni_clone (&args, &new_pid, &ec))
+    spawni_fork_fallback (&args, &new_pid, &ec);
+
   if ((ec == 0) && (pid != NULL))
     *pid = new_pid;
 
-- 
2.34.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-05-02 20:09 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-02 18:06 [PATCH] linux: Fallback to fork and exec for clone fail on posix_spawn (BZ#29115) Adhemerval Zanella
2022-05-02 19:15 ` Alexey Izbyshev
2022-05-02 19:30   ` Florian Weimer
2022-05-02 19:37     ` Alexey Izbyshev
2022-05-02 20:05       ` Florian Weimer
2022-05-02 19:42   ` Adhemerval Zanella
2022-05-02 19:54     ` Alexey Izbyshev
2022-05-02 20:02       ` Adhemerval Zanella
2022-05-02 20:08         ` Alexey Izbyshev
2022-05-02 20:09           ` Adhemerval Zanella
2022-05-02 19:57     ` Adhemerval Zanella
2022-05-02 20:02       ` Alexey Izbyshev
2022-05-02 20:04         ` Adhemerval Zanella

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).