public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
@ 2019-07-31 18:31 ` Adhemerval Zanella
  2019-08-28 14:09   ` Adhemerval Zanella
  2019-08-29  8:38   ` Florian Weimer
  2019-07-31 18:31 ` [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn Adhemerval Zanella
                   ` (6 subsequent siblings)
  7 siblings, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-07-31 18:31 UTC (permalink / raw)
  To: libc-alpha

The child helper process on Linux posix_spawn child must ensure that no signal
handler are enabled, so the signal disposition must be either SIG_DFL or
SIG_IGN.  However, it requires a sigprocmask to obtain the current signal mask
and at least _NSIG sigaction calls to reset the signal handlers for each
posix_spawn call.

This patch optimizes it by tracking on sigaction implementation when a signal
action is set different than SIG_DFL or SIG_IGN.  It allows remove a
sigprocmask call and isse sigaction to reset the disposition only the signals
that has non-default actions set.

It might incur in false positive, since it not easy to remove bits from the
mask without race conditions, but it does not allow false negative since the
mask is updated atomically prior the syscall.  The false positive incur in
just extra sigactions on posix_spawn.

Checked on x86_64 and i686.

	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
	New macros.
	* posix/Makefile (tests): Add tst-spawn6.
	* posix/tst-spawn6.c: New file.
	* sysdeps/generic/sigsetops.h (__sigorset_atomic): New macro.
	* sysdeps/unix/sysv/linux/internal-signals.h (__get_sighandler_set):
	New prototype.
	* sysdeps/unix/sysv/linux/sigaction.c (__get_sighandler_set): New
	function.
	(__libc_sigaction): Set the internal handler_set for a new action.
	* sysdeps/unix/sysv/linux/sigsetops.h (__sigorset_atomic,
	__sigaddset_atomic): New macros.
	* sysdeps/unix/sysv/linux/spawni.c (spawni_child): Replace
	__sigprocmask with __get_sighandler_set.
---
 include/atomic.h                           |  10 +
 posix/Makefile                             |   4 +-
 posix/tst-spawn6.c                         | 220 +++++++++++++++++++++
 sysdeps/generic/sigsetops.h                |   7 +
 sysdeps/unix/sysv/linux/internal-signals.h |   3 +
 sysdeps/unix/sysv/linux/sigaction.c        |  17 ++
 sysdeps/unix/sysv/linux/sigsetops.h        |  15 ++
 sysdeps/unix/sysv/linux/spawni.c           |   9 +-
 8 files changed, 278 insertions(+), 7 deletions(-)
 create mode 100644 posix/tst-spawn6.c

diff --git a/include/atomic.h b/include/atomic.h
index ee1978eb3b..72609efde9 100644
--- a/include/atomic.h
+++ b/include/atomic.h
@@ -646,6 +646,9 @@ void __atomic_link_error (void);
 # define atomic_fetch_or_release(mem, operand) \
   ({ __atomic_check_size((mem));					      \
   __atomic_fetch_or ((mem), (operand), __ATOMIC_RELEASE); })
+# define atomic_fetch_or_seq_cst(mem, operand) \
+  ({ __atomic_check_size((mem));					      \
+  __atomic_fetch_or ((mem), (operand), __ATOMIC_SEQ_CST); })
 
 # define atomic_fetch_xor_release(mem, operand) \
   ({ __atomic_check_size((mem));					      \
@@ -791,6 +794,13 @@ void __atomic_link_error (void);
    ({ atomic_thread_fence_release ();					      \
    atomic_fetch_or_acquire ((mem), (operand)); })
 # endif
+# ifndef atomic_fetch_or_seq_cst
+#  define atomic_fetch_or_seq_cst(mem, operand) \
+   ({ atomic_thread_fence_acquire ();					      \
+   atomic_fetch_or_relaxed ((mem), (operand));				      \
+   atomic_thread_fence_release (); })
+# endif
+
 
 # ifndef atomic_fetch_xor_release
 /* Failing the atomic_compare_exchange_weak_release reloads the value in
diff --git a/posix/Makefile b/posix/Makefile
index 1ac41ad85a..131ae052fd 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -102,7 +102,8 @@ tests		:= test-errno tstgetopt testfnm runtests runptests \
 		   tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
 		   tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
 		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir \
-		   tst-spawn5
+		   tst-spawn5 \
+		   tst-spawn6
 tests-internal	:= bug-regex5 bug-regex20 bug-regex33 \
 		   tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
 		   tst-glob_lstat_compat tst-spawn4-compat
@@ -255,6 +256,7 @@ tst-exec-ARGS = -- $(host-test-program-cmd)
 tst-exec-static-ARGS = $(tst-exec-ARGS)
 tst-execvpe5-ARGS = -- $(host-test-program-cmd)
 tst-spawn-ARGS = -- $(host-test-program-cmd)
+tst-spawn6-ARGS = -- $(host-test-program-cmd)
 tst-spawn-static-ARGS = $(tst-spawn-ARGS)
 tst-spawn5-ARGS = -- $(host-test-program-cmd)
 tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
diff --git a/posix/tst-spawn6.c b/posix/tst-spawn6.c
new file mode 100644
index 0000000000..466e66f104
--- /dev/null
+++ b/posix/tst-spawn6.c
@@ -0,0 +1,220 @@
+/* Tests for posix_spawn signal handling.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <spawn.h>
+#include <sys/wait.h>
+
+#include <support/check.h>
+#include <support/xunistd.h>
+#include <support/support.h>
+#include <array_length.h>
+
+/* Nonzero if the program gets called via `exec'.  */
+static int restart;
+#define CMDLINE_OPTIONS \
+  { "restart", no_argument, &restart, 1 },
+
+enum spawn_test_t
+{
+  SPAWN_SETSIGMASK,
+  SPAWN_SETSIGDEF
+};
+
+static int signal_to_check[] =
+{
+  SIGHUP, SIGINT, SIGALRM, SIGUSR2
+};
+
+/* Called on process re-execution.  */
+static int
+handle_restart (enum spawn_test_t test)
+{
+  switch (test)
+    {
+    case SPAWN_SETSIGMASK:
+      {
+	sigset_t mask;
+	sigprocmask (SIG_BLOCK, NULL, &mask);
+	for (int i = 0; i < array_length (signal_to_check); i++)
+	  if (sigismember (&mask, signal_to_check[i]) != 1)
+	    exit (EXIT_FAILURE);
+      }
+      break;
+    case SPAWN_SETSIGDEF:
+      {
+	for (int i = 0; i < array_length (signal_to_check); i++)
+	  {
+	    struct sigaction act;
+	    if (sigaction (signal_to_check[i], NULL, &act) != 0)
+	      exit (EXIT_FAILURE);
+	    if (act.sa_handler != SIG_DFL)
+	      exit (EXIT_FAILURE);
+	  }
+      }
+      break;
+    }
+
+  return 0;
+}
+
+/* Common argument used for process re-execution.  */
+static char *initial_spargv[5];
+static size_t initial_spargv_size;
+
+/* Re-execute the test process with both '--direct', '--restart', and the
+   TEST (as integer value) as arguments.  */
+static void
+reexecute (enum spawn_test_t test, const posix_spawnattr_t *attrp)
+{
+  char *spargv[8];
+  int i;
+
+  for (i = 0; i < initial_spargv_size; i++)
+    spargv[i] = initial_spargv[i];
+  /* Three digits per byte plus null terminator.  */
+  char teststr[3 * sizeof (test) + 1];
+  snprintf (teststr, array_length (teststr), "%d", test);
+  spargv[i++] = teststr;
+  spargv[i] = NULL;
+  TEST_VERIFY (i < 8);
+
+  pid_t pid;
+  int status;
+
+  TEST_COMPARE (posix_spawn (&pid, spargv[0], NULL, attrp, spargv, environ),
+		0);
+  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
+  TEST_VERIFY (WIFEXITED (status));
+  TEST_VERIFY (!WIFSIGNALED (status));
+  TEST_COMPARE (WEXITSTATUS (status), 0);
+}
+
+/* Test if POSIX_SPAWN_SETSIGMASK change the spawn process signal mask to
+   the value blocked signals defined by SIGNAL_TO_CHECK signals.  */
+static void
+do_test_setsigmask (void)
+{
+  posix_spawnattr_t attr;
+  /* posix_spawnattr_init does not fail.  */
+  posix_spawnattr_init (&attr);
+
+  {
+    sigset_t mask;
+    TEST_COMPARE (sigemptyset (&mask), 0);
+    for (int i = 0; i < array_length (signal_to_check); i++)
+      TEST_COMPARE (sigaddset (&mask, signal_to_check[i]), 0);
+    TEST_COMPARE (posix_spawnattr_setsigmask (&attr, &mask), 0);
+    TEST_COMPARE (posix_spawnattr_setflags (&attr, POSIX_SPAWN_SETSIGMASK), 0);
+  }
+
+  /* Change current mask to be different than the one asked for spawned
+     process.  */
+  {
+    sigset_t empty_mask, current_mask;
+    TEST_COMPARE (sigemptyset (&empty_mask), 0);
+    TEST_COMPARE (sigprocmask (SIG_BLOCK, &empty_mask, &current_mask), 0);
+
+    reexecute (SPAWN_SETSIGMASK, &attr);
+
+    TEST_COMPARE (sigprocmask (SIG_SETMASK, &current_mask, NULL), 0);
+  }
+}
+
+/* Test if POSIX_SPAWN_SETSIGDEF change the spawn process signal actions
+   defined by SIGNAL_TO_CHECK signals to default actions.  */
+static void
+do_test_setsigdef (void)
+{
+  posix_spawnattr_t attr;
+  /* posix_spawnattr_init does not fail.  */
+  posix_spawnattr_init (&attr);
+
+  {
+    sigset_t mask;
+    TEST_COMPARE (sigemptyset (&mask), 0);
+    for (int i = 0; i < array_length (signal_to_check); i++)
+      TEST_COMPARE (sigaddset (&mask, signal_to_check[i]), 0);
+    TEST_COMPARE (posix_spawnattr_setsigdefault (&attr, &mask), 0);
+    TEST_COMPARE (posix_spawnattr_setflags (&attr, POSIX_SPAWN_SETSIGDEF), 0);
+  }
+
+  /* Change current signal disposition to be different than the one asked for
+     spawned process.  */
+  struct sigaction default_act[array_length (signal_to_check)];
+  {
+    sigset_t empty_mask;
+    TEST_COMPARE (sigemptyset (&empty_mask), 0);
+    for (int i = 0; i < array_length (signal_to_check); i++)
+      TEST_COMPARE (sigaction (signal_to_check[i],
+			       &((struct sigaction) { .sa_handler = SIG_IGN,
+						      .sa_mask = empty_mask,
+						      .sa_flags = 0 }),
+			       &default_act[i]),
+		    0);
+  }
+
+  reexecute (SPAWN_SETSIGDEF, &attr);
+
+  /* Restore signal dispositions.  */
+  for (int i = 0; i < array_length (signal_to_check); i++)
+    TEST_COMPARE (sigaction (signal_to_check[i], &default_act[i], NULL), 0);
+}
+
+static int
+do_test (int argc, char *argv[])
+{
+  /* We must have one or four parameters left if called initially:
+       + path for ld.so		optional
+       + "--library-path"	optional
+       + the library path	optional
+       + the application name
+
+     Plus one parameter to indicate which test to execute through
+     re-execution.
+
+     So for default usage without --enable-hardcoded-path-in-tests, it
+     will be called initially with 5 arguments and later with 2.  For
+     --enable-hardcoded-path-in-tests it will be called with 2 arguments
+     regardless.  */
+
+  if (argc != (restart ? 2 : 5) && argc != 2)
+    FAIL_EXIT1 ("wrong number of arguments (%d)", argc);
+
+  if (restart)
+    return handle_restart (atoi (argv[1]));
+
+  {
+    int i;
+    for (i = 0; i < (argc == 5 ? 4 : 1); i++)
+      initial_spargv[i] = argv[i + 1];
+    initial_spargv[i++] = (char *) "--direct";
+    initial_spargv[i++] = (char *) "--restart";
+    initial_spargv_size = i;
+  }
+
+  do_test_setsigmask ();
+  do_test_setsigdef ();
+
+  return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/sysdeps/generic/sigsetops.h b/sysdeps/generic/sigsetops.h
index ddeeb0b0d5..9cae11771b 100644
--- a/sysdeps/generic/sigsetops.h
+++ b/sysdeps/generic/sigsetops.h
@@ -66,6 +66,13 @@
     0;						\
   }))
 
+# define __sigorset_atomic(set)			\
+  (__extension__ ({				\
+    __sigset_t __mask = __sigmask (sig);	\
+    atomic_fetch_or_seq_cst (set, mask);	\
+    0;						\
+  }))
+
 # define __sigdelset(set, sig)			\
   (__extension__ ({				\
     __sigset_t __mask = __sigmask (sig);	\
diff --git a/sysdeps/unix/sysv/linux/internal-signals.h b/sysdeps/unix/sysv/linux/internal-signals.h
index 3562011d21..385442f81e 100644
--- a/sysdeps/unix/sysv/linux/internal-signals.h
+++ b/sysdeps/unix/sysv/linux/internal-signals.h
@@ -88,4 +88,7 @@ __libc_signal_restore_set (const sigset_t *set)
 /* Used to communicate with signal handler.  */
 extern struct xid_command *__xidcmd attribute_hidden;
 
+/* Used to obtained the modified signal handlers.  */
+extern void __get_sighandler_set (sigset_t *set) attribute_hidden;
+
 #endif
diff --git a/sysdeps/unix/sysv/linux/sigaction.c b/sysdeps/unix/sysv/linux/sigaction.c
index 52722b08ae..3bcf3946ab 100644
--- a/sysdeps/unix/sysv/linux/sigaction.c
+++ b/sysdeps/unix/sysv/linux/sigaction.c
@@ -20,6 +20,7 @@
 #include <string.h>
 
 #include <sysdep.h>
+#include <sigsetops.h>
 #include <sys/syscall.h>
 
 /* New ports should not define the obsolete SA_RESTORER, however some
@@ -36,6 +37,13 @@
 # define STUB(act, sigsetsize) (sigsetsize)
 #endif
 
+static sigset_t handler_set;
+
+void __get_sighandler_set (sigset_t *set)
+{
+  *set = handler_set;
+}
+
 /* If ACT is not NULL, change the action for SIG to *ACT.
    If OACT is not NULL, put the old action for SIG in *OACT.  */
 int
@@ -47,6 +55,15 @@ __libc_sigaction (int sig, const struct sigaction *act, struct sigaction *oact)
 
   if (act)
     {
+      /* Tracks which signal had a signal handler set different from default
+	 (SIG_DFL/SIG_IGN).  It allows optimize posix_spawn to reset only
+	 those signals.  It might incur in false positive, since it not easy
+	 to remove bits from the mask without race conditions, but it does not
+	 allow false negative since the mask is updated atomically prior the
+	 syscall.  The false positive incur in just extra sigactions on
+	 posix_spawn.  */
+      if (act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN)
+	__sigaddset_atomic (&handler_set, sig);
       kact.k_sa_handler = act->sa_handler;
       memcpy (&kact.sa_mask, &act->sa_mask, sizeof (sigset_t));
       kact.sa_flags = act->sa_flags;
diff --git a/sysdeps/unix/sysv/linux/sigsetops.h b/sysdeps/unix/sysv/linux/sigsetops.h
index 713d4840d8..6c98c83e42 100644
--- a/sysdeps/unix/sysv/linux/sigsetops.h
+++ b/sysdeps/unix/sysv/linux/sigsetops.h
@@ -20,6 +20,7 @@
 #define _SIGSETOPS_H 1
 
 #include <signal.h>
+#include <atomic.h>
 
 /* Return a mask that includes the bit for SIG only.  */
 # define __sigmask(sig) \
@@ -80,6 +81,12 @@
     (void)0;							\
   }))
 
+# define __sigorset_atomic(dest, left, right)	\
+  (__extension__ ({				\
+     atomic_fetch_or_seq_cst (dest, left, right); \
+    0;						\
+  }))
+
 /* These macros needn't check for a bogus signal number;
    error checking is done in the non-__ versions.  */
 # define __sigismember(set, sig)				\
@@ -97,6 +104,14 @@
     (void)0;							\
   }))
 
+# define __sigaddset_atomic(set, sig)				\
+  (__extension__ ({						\
+    unsigned long int __mask = __sigmask (sig);			\
+    unsigned long int __word = __sigword (sig);			\
+    atomic_fetch_or_seq_cst (&((set)->__val[__word]), __mask);	\
+    (void)0;							\
+  }))
+
 # define __sigdelset(set, sig)					\
   (__extension__ ({						\
     unsigned long int __mask = __sigmask (sig);			\
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index 0f7a8ca5df..264edd09c6 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -132,17 +132,14 @@ spawni_child (void *arguments)
   const posix_spawnattr_t *restrict attr = args->attr;
   const posix_spawn_file_actions_t *file_actions = args->fa;
 
-  /* The child must ensure that no signal handler are enabled because it shared
+  /* The child must ensure that no signal handler are enabled because it share
      memory with parent, so the signal disposition must be either SIG_DFL or
-     SIG_IGN.  It does by iterating over all signals and although it could
-     possibly be more optimized (by tracking which signal potentially have a
-     signal handler), it might requires system specific solutions (since the
-     sigset_t data type can be very different on different architectures).  */
+     SIG_IGN.  */
   struct sigaction sa;
   memset (&sa, '\0', sizeof (sa));
 
   sigset_t hset;
-  __sigprocmask (SIG_BLOCK, 0, &hset);
+  __get_sighandler_set (&hset);
   for (int sig = 1; sig < _NSIG; ++sig)
     {
       if ((attr->__flags & POSIX_SPAWN_SETSIGDEF)
-- 
2.17.1

^ permalink raw reply	[flat|nested] 59+ messages in thread

* [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn Adhemerval Zanella
@ 2019-07-31 18:31 ` Adhemerval Zanella
  2019-08-28 14:09   ` Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH v2 5/5] posix: Use posix_spawn for wordexp Adhemerval Zanella
                   ` (4 subsequent siblings)
  7 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-07-31 18:31 UTC (permalink / raw)
  To: libc-alpha

Changes from previous version:

  - Remove __spawn_valid_fd check on posix_spawn_file_actions_addclosefrom.

  - Rename __SUPPORT_SPAWN_CLOSEFROM macro to __SPAWN_SUPPORT_CLOSEFROM.

  - Use getdents64 instead of changing add internal __opendir_inplace.

Regarding the lseek to reset the descriptor offset after each file removal,
I think that by iteracting the descriptor based on getdents64 results there
is no real need for it.  The kernel returns the fds sequentially and there
is no open operations done concurrently, so both my expectations and the
tests result is getdents returns an updated version after a close() call.

--

This patch adds a way to close a range of file descriptors on posix_spawn
as a new file action.  The API is similar to the one provided by Solaris
11 [1], where the file action causes the all open file descriptors greater
than or equal to input on to be closed when the new process is spawned.

There are some discussions on BZ#10353 [2], although the bug itself asks
for a generic solution (similar to the closeall provided by some BSD).
The posix_spawn is safe to be implemented by interacting over /proc/self/fd,
the Linux spawni.c does not use CLONE_FILES, so the helper process has its
own file descriptor table and any failure (in /proc operation) aborts the
process creation and returns an error to the caller.

I am aware that this file action might be redundant to the current approach
of POSIX in promoting O_CLOEXEC in more interfaces. However O_CLOEXEC is still
not the default and for some specific usages, the caller needs to close all
possible file descriptors to avoid them leaking.  Some examples are CPython
(discussed in BZ#10353) and OpenJDK jspawnhelper [3] (where OpenJDK spawns a
helper process to exactly closes all file descriptors).  Most likely any
environment which calls functions that might open file descriptor under the
hood and aim to use posix_spawn might face the same requirement.

Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64le-linux-gnu, and
aarch64-linux-gnu.

	* posix/Makefile (routines): Add spawn_faction_addclosefrom.
	(tests): Add tst-spawn5.
	(tst-spawn5-ARGS): New rule.
	* posix/Versions [GLIBC_2.30] (libc): Add
	posix_spawn_file_actions_addclosefrom_np.
	* posix/spawn.h (posix_spawn_file_actions_addclosefrom_np): New
	prototype.
	* posix/spawn_faction_addclosefrom.c: New file
	* posix/spawn_faction_destroy.c (__posix_spawn_file_actions_destroy):
	Handle spawn_do_closefrom.
	* posix/spawn_int.h (__spawn_action): Add closefrom_action and
	spawn_do_closefrom.
	* posix/spawn_int_abi.h: New file.
	* sysdeps/unix/sysv/linux/spawn_int_abi.h: Likewise.
	* posix/tst-spawn5.c: Likewise.
	* sysdeps/mach/hurd/spawni.c (__spawni, __spawni_child): Handle
	spawn_do_closefrom.
	* sysdeps/posix/spawni.c (__spawni_child): Likewise.
	* sysdeps/unix/sysv/linux/spawni.c (__spawni_child, __spawnix):
	Likewise.
	(spawn_closefrom): New function.
	* sysdeps/mach/hurd/i386/libc.abilist (2.30): Add
	posix_spawn_file_actions_addclosefrom_np.
	* sysdeps/unix/sysv/linux/aarch64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/alpha/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/arm/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/csky/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/hppa/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/i386/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/ia64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/microblaze/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/nios2/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist:
	Likewise.
	* sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/sh/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist: Likewise.
	* sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/unix/sysv/linux/x86_64/64/libc.abilist: Likewise.
	* sysdeps/unix/sysv/unix/sysv/linux/x86_64/x32/libc.abilist: Likewise.
---
 posix/Makefile                                |   5 +-
 posix/Versions                                |   1 +
 posix/spawn.h                                 |   7 +
 posix/spawn_faction_addclosefrom.c            |  58 +++++
 posix/spawn_faction_destroy.c                 |   1 +
 posix/spawn_int.h                             |   6 +
 posix/spawn_int_abi.h                         |  27 +++
 posix/tst-spawn5.c                            | 210 ++++++++++++++++++
 sysdeps/mach/hurd/i386/libc.abilist           |   1 +
 sysdeps/mach/hurd/spawni.c                    |   4 +
 sysdeps/posix/spawni.c                        |   4 +
 sysdeps/unix/sysv/linux/aarch64/libc.abilist  |   1 +
 sysdeps/unix/sysv/linux/alpha/libc.abilist    |   1 +
 sysdeps/unix/sysv/linux/arm/libc.abilist      |   1 +
 sysdeps/unix/sysv/linux/csky/libc.abilist     |   1 +
 sysdeps/unix/sysv/linux/hppa/libc.abilist     |   1 +
 sysdeps/unix/sysv/linux/i386/libc.abilist     |   1 +
 sysdeps/unix/sysv/linux/ia64/libc.abilist     |   1 +
 .../sysv/linux/m68k/coldfire/libc.abilist     |   1 +
 .../unix/sysv/linux/m68k/m680x0/libc.abilist  |   1 +
 .../unix/sysv/linux/microblaze/libc.abilist   |   1 +
 .../sysv/linux/mips/mips32/fpu/libc.abilist   |   1 +
 .../sysv/linux/mips/mips32/nofpu/libc.abilist |   1 +
 .../sysv/linux/mips/mips64/n32/libc.abilist   |   1 +
 .../sysv/linux/mips/mips64/n64/libc.abilist   |   1 +
 sysdeps/unix/sysv/linux/nios2/libc.abilist    |   1 +
 .../linux/powerpc/powerpc32/fpu/libc.abilist  |   1 +
 .../linux/powerpc/powerpc64/be/libc.abilist   |   1 +
 .../linux/powerpc/powerpc64/le/libc.abilist   |   1 +
 .../unix/sysv/linux/riscv/rv64/libc.abilist   |   1 +
 .../unix/sysv/linux/s390/s390-32/libc.abilist |   1 +
 .../unix/sysv/linux/s390/s390-64/libc.abilist |   1 +
 sysdeps/unix/sysv/linux/sh/libc.abilist       |   1 +
 .../sysv/linux/sparc/sparc32/libc.abilist     |   1 +
 .../sysv/linux/sparc/sparc64/libc.abilist     |   1 +
 sysdeps/unix/sysv/linux/spawn_int_abi.h       |  25 +++
 sysdeps/unix/sysv/linux/spawni.c              |  62 +++++-
 .../unix/sysv/linux/x86_64/64/libc.abilist    |   1 +
 .../unix/sysv/linux/x86_64/x32/libc.abilist   |   1 +
 39 files changed, 425 insertions(+), 12 deletions(-)
 create mode 100644 posix/spawn_faction_addclosefrom.c
 create mode 100644 posix/spawn_int_abi.h
 create mode 100644 posix/tst-spawn5.c
 create mode 100644 sysdeps/unix/sysv/linux/spawn_int_abi.h

diff --git a/posix/Makefile b/posix/Makefile
index 8ac6743ad7..1ac41ad85a 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -57,6 +57,7 @@ routines :=								      \
 	spawn_faction_init spawn_faction_destroy spawn_faction_addclose	      \
 	spawn_faction_addopen spawn_faction_adddup2 spawn_valid_fd	      \
 	spawn_faction_addchdir spawn_faction_addfchdir			      \
+	spawn_faction_addclosefrom					      \
 	spawnattr_init spawnattr_destroy				      \
 	spawnattr_getdefault spawnattr_setdefault			      \
 	spawnattr_getflags spawnattr_setflags				      \
@@ -100,7 +101,8 @@ tests		:= test-errno tstgetopt testfnm runtests runptests \
 		   tst-posix_fadvise tst-posix_fadvise64 \
 		   tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
 		   tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
-		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir
+		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir \
+		   tst-spawn5
 tests-internal	:= bug-regex5 bug-regex20 bug-regex33 \
 		   tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
 		   tst-glob_lstat_compat tst-spawn4-compat
@@ -254,6 +256,7 @@ tst-exec-static-ARGS = $(tst-exec-ARGS)
 tst-execvpe5-ARGS = -- $(host-test-program-cmd)
 tst-spawn-ARGS = -- $(host-test-program-cmd)
 tst-spawn-static-ARGS = $(tst-spawn-ARGS)
+tst-spawn5-ARGS = -- $(host-test-program-cmd)
 tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
 tst-chmod-ARGS = $(objdir)
 tst-vfork3-ARGS = --test-dir=$(objpfx)
diff --git a/posix/Versions b/posix/Versions
index 7d06a6d0c0..c8268e5996 100644
--- a/posix/Versions
+++ b/posix/Versions
@@ -146,6 +146,7 @@ libc {
     posix_spawn_file_actions_addfchdir_np;
   }
   GLIBC_2.30 {
+    posix_spawn_file_actions_addclosefrom_np;
   }
   GLIBC_PRIVATE {
     __libc_fork; __libc_pread; __libc_pwrite;
diff --git a/posix/spawn.h b/posix/spawn.h
index 471dbea022..773f416b2e 100644
--- a/posix/spawn.h
+++ b/posix/spawn.h
@@ -213,6 +213,13 @@ extern int posix_spawn_file_actions_addchdir_np (posix_spawn_file_actions_t *
 extern int posix_spawn_file_actions_addfchdir_np (posix_spawn_file_actions_t *,
 						  int __fd)
      __THROW __nonnull ((1));
+
+/* Add an action to close all file descriptor greater than FROM during
+   spawn.  This affects the subsequent file actions.  */
+extern int posix_spawn_file_actions_addclosefrom_np (posix_spawn_file_actions_t *,
+						     int __from)
+     __THROW __nonnull ((1));
+
 #endif
 
 __END_DECLS
diff --git a/posix/spawn_faction_addclosefrom.c b/posix/spawn_faction_addclosefrom.c
new file mode 100644
index 0000000000..52e949c8b3
--- /dev/null
+++ b/posix/spawn_faction_addclosefrom.c
@@ -0,0 +1,58 @@
+/* Add a closefrom to a file action list for posix_spawn.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <errno.h>
+#include <spawn.h>
+#include <unistd.h>
+#include <spawn_int.h>
+
+int
+__posix_spawn_file_actions_addclosefrom (posix_spawn_file_actions_t
+					 *file_actions, int from)
+{
+#if __SPAWN_SUPPORT_CLOSEFROM
+  struct __spawn_action *rec;
+
+  if (fd < 0)
+    return EBADF;
+
+  /* Allocate more memory if needed.  */
+  if (file_actions->__used == file_actions->__allocated
+      && __posix_spawn_file_actions_realloc (file_actions) != 0)
+    /* This can only mean we ran out of memory.  */
+    return ENOMEM;
+
+  /* Add the new value.  */
+  rec = &file_actions->__actions[file_actions->__used];
+  rec->tag = spawn_do_closefrom;
+  rec->action.closefrom_action.from = from;
+
+  /* Account for the new entry.  */
+  ++file_actions->__used;
+
+  return 0;
+#else
+  __set_errno (EINVAL);
+  return -1;
+#endif
+}
+weak_alias (__posix_spawn_file_actions_addclosefrom,
+	    posix_spawn_file_actions_addclosefrom_np)
+#if !__SPAWN_SUPPORT_CLOSEFROM
+stub_warning (posix_spawn_file_actions_addclosefrom_np)
+#endif
diff --git a/posix/spawn_faction_destroy.c b/posix/spawn_faction_destroy.c
index 51fab13585..b45d1cd889 100644
--- a/posix/spawn_faction_destroy.c
+++ b/posix/spawn_faction_destroy.c
@@ -39,6 +39,7 @@ __posix_spawn_file_actions_destroy (posix_spawn_file_actions_t *file_actions)
 	case spawn_do_close:
 	case spawn_do_dup2:
 	case spawn_do_fchdir:
+	case spawn_do_closefrom:
 	  /* No cleanup required.  */
 	  break;
 	}
diff --git a/posix/spawn_int.h b/posix/spawn_int.h
index 93b7597f90..0bc29226e4 100644
--- a/posix/spawn_int.h
+++ b/posix/spawn_int.h
@@ -20,6 +20,7 @@
 #define _SPAWN_INT_H
 
 #include <spawn.h>
+#include <spawn_int_abi.h>
 #include <stdbool.h>
 
 /* Data structure to contain the action information.  */
@@ -32,6 +33,7 @@ struct __spawn_action
     spawn_do_open,
     spawn_do_chdir,
     spawn_do_fchdir,
+    spawn_do_closefrom,
   } tag;
 
   union
@@ -60,6 +62,10 @@ struct __spawn_action
     {
       int fd;
     } fchdir_action;
+    struct
+    {
+      int from;
+    } closefrom_action;
   } action;
 };
 
diff --git a/posix/spawn_int_abi.h b/posix/spawn_int_abi.h
new file mode 100644
index 0000000000..142efed339
--- /dev/null
+++ b/posix/spawn_int_abi.h
@@ -0,0 +1,27 @@
+/* Internal ABI specific for posix_spawn functionality.  Generic version.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SPAWN_INT_ABI_H
+#define _SPAWN_INT_ABI_H
+
+/* The closefrom file actions requires either a syscall or an arch-specific
+   way to interact over all file descriptors and act uppon them (such
+   /proc/self/fd on Linux).  */
+#define __SPAWN_SUPPOR_CLOSEFROM 0
+
+#endif /* _SPAWN_INT_H */
diff --git a/posix/tst-spawn5.c b/posix/tst-spawn5.c
new file mode 100644
index 0000000000..7af33a4dbe
--- /dev/null
+++ b/posix/tst-spawn5.c
@@ -0,0 +1,210 @@
+/* Tests for posix_spawn signal handling.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <spawn.h>
+#include <fcntl.h>
+#include <sys/wait.h>
+#include <dirent.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <limits.h>
+
+#include <support/check.h>
+#include <support/xunistd.h>
+#include <support/support.h>
+#include <array_length.h>
+
+/* Nonzero if the program gets called via `exec'.  */
+static int restart;
+#define CMDLINE_OPTIONS \
+  { "restart", no_argument, &restart, 1 },
+
+/* Called on process re-execution.  */
+static int
+handle_restart (int from)
+{
+  DIR *fds = opendir ("/proc/self/fd");
+  if (fds == NULL)
+    FAIL_EXIT1 ("opendir (\"/proc/self/fd\"): %m");
+
+  while (true)
+    {
+      errno = 0;
+      struct dirent64 *e = readdir64 (fds);
+      if (e == NULL)
+        {
+          if (errno != 0)
+            FAIL_EXIT1 ("readdir: %m");
+          break;
+        }
+
+      if (e->d_name[0] == '.')
+        continue;
+
+      char *endptr;
+      long int fd = strtol (e->d_name, &endptr, 10);
+      if (*endptr != '\0' || fd < 0 || fd > INT_MAX)
+        FAIL_EXIT1 ("readdir: invalid file descriptor name: /proc/self/fd/%s",
+                    e->d_name);
+
+      /* Skip the descriptor which is used to enumerate the
+         descriptors.  */
+      if (fd == dirfd (fds))
+        continue;
+
+      struct stat64 st;
+      if (fstat64 (fd, &st) != 0)
+        FAIL_EXIT1 ("readdir: fstat64 (%ld) failed: %m", fd);
+
+      if (fd >= from)
+	FAIL_EXIT1 ("error: fd (%ld) greater than from (%d)", fd, from);
+    }
+
+  closedir (fds);
+
+  return 0;
+}
+
+/* Common argument used for process re-execution.  */
+static char *initial_spargv[5];
+static size_t initial_spargv_size;
+
+/* Re-execute the test process with both '--direct', '--restart', and the
+   TEST (as integer value) as arguments.  */
+static void
+reexecute (int fd, const posix_spawn_file_actions_t *fa)
+{
+  char *spargv[8];
+  int i;
+
+  for (i = 0; i < initial_spargv_size; i++)
+    spargv[i] = initial_spargv[i];
+  /* Three digits per byte plus null terminator.  */
+  char teststr[3 * sizeof (fd) + 1];
+  snprintf (teststr, array_length (teststr), "%d", fd);
+  spargv[i++] = teststr;
+  spargv[i] = NULL;
+  TEST_VERIFY (i < 8);
+
+  pid_t pid;
+  int status;
+
+  TEST_COMPARE (posix_spawn (&pid, spargv[0], fa, NULL, spargv, environ),
+		0);
+  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
+  TEST_VERIFY (WIFEXITED (status));
+  TEST_VERIFY (!WIFSIGNALED (status));
+  TEST_COMPARE (WEXITSTATUS (status), 0);
+}
+
+static void
+do_test_closefrom (int num_fd_to_open)
+{
+  int *fds = xmalloc (num_fd_to_open * sizeof (int));
+  for (int i = 0; i < num_fd_to_open; i++)
+    fds[i] = xopen ("/dev/null", O_WRONLY, 0);
+
+  posix_spawn_file_actions_t fa;
+  /* posix_spawn_file_actions_init does not fail.  */
+  posix_spawn_file_actions_init (&fa);
+
+  {
+    int ret = posix_spawn_file_actions_addclosefrom_np (&fa, fds[0]);
+    if (ret == -1)
+      {
+	if (errno == ENOSYS)
+	  /* Hurd currently does not support closefrom fileaction.  */
+	  FAIL_UNSUPPORTED ("posix_spawn_file_actions_addclosefrom_np unsupported");
+        else
+	  FAIL_EXIT1 ("posix_spawn_file_actions_addclosefrom_np failed");
+      }
+  }
+
+  /* Default check, all file descriptor from [fd[0], fd[1]) are opened.  */
+  reexecute (fds[0], &fa);
+
+  /* Add a gap in the range.  */
+  xclose (fds[num_fd_to_open/2]);
+  xclose (fds[num_fd_to_open/2 + 1]);
+  reexecute (fds[0], &fa);
+
+  /* Add another gap, at the beginning.  */
+  xclose (fds[0]);
+  xclose (fds[1]);
+  reexecute (fds[0], &fa);
+
+  /* Add another gap, now at the end.  */
+  xclose (fds[num_fd_to_open-1]);
+  xclose (fds[num_fd_to_open-2]);
+  reexecute (fds[0], &fa);
+
+  /* Open some more files, filling the gaps.  */
+  for (int i = 0; i < 6; i++)
+    xopen ("/dev/null", O_WRONLY, 0);
+  reexecute (fds[0], &fa);
+
+  /* Open some more, but with O_CLOEXEC.  */
+  for (int i = 0; i < num_fd_to_open/2; i++)
+    xopen ("/dev/null", O_WRONLY | O_CLOEXEC, 0);
+
+  free (fds);
+}
+
+
+static int
+do_test (int argc, char *argv[])
+{
+  /* We must have one or four parameters left if called initially:
+       + path for ld.so		optional
+       + "--library-path"	optional
+       + the library path	optional
+       + the application name
+
+     Plus one parameter to indicate which test to execute through
+     re-execution.
+
+     So for default usage without --enable-hardcoded-path-in-tests, it
+     will be called initially with 5 arguments and later with 2.  For
+     --enable-hardcoded-path-in-tests it will be called with 2 arguments
+     regardless.  */
+
+  if (argc != (restart ? 2 : 5) && argc != 2)
+    FAIL_EXIT1 ("wrong number of arguments (%d)", argc);
+
+  if (restart)
+    return handle_restart (atoi (argv[1]));
+
+  /* Respawn using the same arguments.  */
+  for (initial_spargv_size = 0;
+       initial_spargv_size < (argc == 5 ? 4 : 1);
+       initial_spargv_size++)
+    initial_spargv[initial_spargv_size] = argv[initial_spargv_size + 1];
+  initial_spargv[initial_spargv_size++] = (char *) "--direct";
+  initial_spargv[initial_spargv_size++] = (char *) "--restart";
+
+  do_test_closefrom (10);
+  do_test_closefrom (100);
+
+  return 0;
+}
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/sysdeps/mach/hurd/i386/libc.abilist b/sysdeps/mach/hurd/i386/libc.abilist
index 1fc7ab2433..fcf957cfc6 100644
--- a/sysdeps/mach/hurd/i386/libc.abilist
+++ b/sysdeps/mach/hurd/i386/libc.abilist
@@ -2175,6 +2175,7 @@ GLIBC_2.3.4 setipv4sourcefilter F
 GLIBC_2.3.4 setsourcefilter F
 GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
 GLIBC_2.4 __fgets_chk F
diff --git a/sysdeps/mach/hurd/spawni.c b/sysdeps/mach/hurd/spawni.c
index e8024a2679..720e56ffb3 100644
--- a/sysdeps/mach/hurd/spawni.c
+++ b/sysdeps/mach/hurd/spawni.c
@@ -597,6 +597,10 @@ __spawni (pid_t *pid, const char *file,
 	  case spawn_do_fchdir:
 	    err = child_fchdir (action->action.fchdir_action.fd);
 	    break;
+
+	  case spawn_do_closefrom:
+	    err = EINVAL;
+	    break;
 	  }
 
 	if (err)
diff --git a/sysdeps/posix/spawni.c b/sysdeps/posix/spawni.c
index a5913feb14..3beaba91db 100644
--- a/sysdeps/posix/spawni.c
+++ b/sysdeps/posix/spawni.c
@@ -231,6 +231,10 @@ __spawni_child (void *arguments)
 	      if (__fchdir (action->action.fchdir_action.fd) != 0)
 		goto fail;
 	      break;
+
+	    case spawn_do_closefrom:
+	      __set_errno (EINVAL);
+	      goto fail;
 	    }
 	}
     }
diff --git a/sysdeps/unix/sysv/linux/aarch64/libc.abilist b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
index a4c31932cb..e1e793b348 100644
--- a/sysdeps/unix/sysv/linux/aarch64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
@@ -2143,5 +2143,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/alpha/libc.abilist b/sysdeps/unix/sysv/linux/alpha/libc.abilist
index fe85a35620..735e54f433 100644
--- a/sysdeps/unix/sysv/linux/alpha/libc.abilist
+++ b/sysdeps/unix/sysv/linux/alpha/libc.abilist
@@ -2218,6 +2218,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/arm/libc.abilist b/sysdeps/unix/sysv/linux/arm/libc.abilist
index bc3df8dcea..a3b9db6efa 100644
--- a/sysdeps/unix/sysv/linux/arm/libc.abilist
+++ b/sysdeps/unix/sysv/linux/arm/libc.abilist
@@ -128,6 +128,7 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _Exit F
diff --git a/sysdeps/unix/sysv/linux/csky/libc.abilist b/sysdeps/unix/sysv/linux/csky/libc.abilist
index 9b3cee65bb..88d112b9b1 100644
--- a/sysdeps/unix/sysv/linux/csky/libc.abilist
+++ b/sysdeps/unix/sysv/linux/csky/libc.abilist
@@ -2087,5 +2087,6 @@ GLIBC_2.29 xprt_register F
 GLIBC_2.29 xprt_unregister F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/hppa/libc.abilist b/sysdeps/unix/sysv/linux/hppa/libc.abilist
index 75edece94a..2925c96183 100644
--- a/sysdeps/unix/sysv/linux/hppa/libc.abilist
+++ b/sysdeps/unix/sysv/linux/hppa/libc.abilist
@@ -2039,6 +2039,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/i386/libc.abilist b/sysdeps/unix/sysv/linux/i386/libc.abilist
index edeaf8e722..9fd8ceb639 100644
--- a/sysdeps/unix/sysv/linux/i386/libc.abilist
+++ b/sysdeps/unix/sysv/linux/i386/libc.abilist
@@ -2205,6 +2205,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/ia64/libc.abilist b/sysdeps/unix/sysv/linux/ia64/libc.abilist
index b5d460eeb2..37d817eeb3 100644
--- a/sysdeps/unix/sysv/linux/ia64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/ia64/libc.abilist
@@ -2071,6 +2071,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
index 05633b3cb8..e81ab1f0bf 100644
--- a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
@@ -129,6 +129,7 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _Exit F
diff --git a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
index 47eb7b4608..cd5742bf63 100644
--- a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
+++ b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
@@ -2148,6 +2148,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/microblaze/libc.abilist b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
index f7ced487f7..957b14d992 100644
--- a/sysdeps/unix/sysv/linux/microblaze/libc.abilist
+++ b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
@@ -2135,5 +2135,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
index e49dc4272e..b8ffdea448 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
@@ -2122,6 +2122,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
index daa3b60c5b..e1c861720a 100644
--- a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
@@ -2120,6 +2120,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
index 457ce0b6f2..88fe3f4d26 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
@@ -2128,6 +2128,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
index 63d5c03bfb..7c6dafa818 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
@@ -2122,6 +2122,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/nios2/libc.abilist b/sysdeps/unix/sysv/linux/nios2/libc.abilist
index 7fec0c9670..487b005070 100644
--- a/sysdeps/unix/sysv/linux/nios2/libc.abilist
+++ b/sysdeps/unix/sysv/linux/nios2/libc.abilist
@@ -2176,5 +2176,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
index 9200a54309..db81db978a 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
@@ -2178,6 +2178,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
index 2860df8ebc..06dfdf1fed 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
@@ -2041,6 +2041,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
index 2229a1dcc0..eb0532937e 100644
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
@@ -2245,5 +2245,6 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
index 31010e6cf7..4985fa93e3 100644
--- a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
@@ -2105,5 +2105,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
index 576295deff..1ef1b9d4cc 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
@@ -2173,6 +2173,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
index abf0473683..f9d9fe68ca 100644
--- a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
@@ -2077,6 +2077,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/sh/libc.abilist b/sysdeps/unix/sysv/linux/sh/libc.abilist
index 41977f6e9c..1b12384dd1 100644
--- a/sysdeps/unix/sysv/linux/sh/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sh/libc.abilist
@@ -2043,6 +2043,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
index 3d2f00ca52..a7c244cb56 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
@@ -2167,6 +2167,7 @@ GLIBC_2.30 __nldbl_warn F
 GLIBC_2.30 __nldbl_warnx F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 _IO_fprintf F
diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
index 2f20643e8e..a71facfb43 100644
--- a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
@@ -2094,6 +2094,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/spawn_int_abi.h b/sysdeps/unix/sysv/linux/spawn_int_abi.h
new file mode 100644
index 0000000000..9c4b31ccae
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/spawn_int_abi.h
@@ -0,0 +1,25 @@
+/* Internal ABI specific for posix_spawn functionality.  Linux version.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SPAWN_INT_ABI_H
+#define _SPAWN_INT_ABI_H
+
+/* spawni.c implements closefrom by interacting over /proc/self/fd.  */
+#define __SPAWN_SUPPORT_CLOSEFROM 1
+
+#endif /* _SPAWN_INT_H */
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index c1abf3f960..ca7bf99825 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -17,20 +17,16 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <spawn.h>
-#include <fcntl.h>
 #include <paths.h>
-#include <string.h>
+#include <dirent.h>
 #include <sys/resource.h>
-#include <sys/wait.h>
-#include <sys/param.h>
-#include <sys/mman.h>
 #include <not-cancel.h>
 #include <local-setxid.h>
 #include <shlib-compat.h>
-#include <nptl/pthreadP.h>
-#include <dl-sysdep.h>
-#include <libc-pointer-arith.h>
+#include <sigsetops.h>
+#include <internal-signals.h>
 #include <ldsodefs.h>
+#include <ctype.h>
 #include "spawn_int.h"
 
 /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
@@ -114,6 +110,44 @@ maybe_script_execute (struct posix_spawn_args *args)
     }
 }
 
+/* Close all file descriptor up to FROM by interacting /proc/self/fd.  */
+static bool
+spawn_closefrom (int from)
+{
+  struct dirent64 entries[1024 / sizeof (struct dirent64)];
+
+  int dirfd = __open ("/proc/self/fd", O_RDONLY | O_DIRECTORY, 0);
+  if (dirfd == -1)
+    return false;
+
+  ssize_t r;
+  while ((r = __getdents64 (dirfd, entries, sizeof (entries))) > 0)
+    {
+      struct dirent64 *dp = entries;
+      struct dirent64 *edp = (void *)((uintptr_t) dp + r);
+
+      for (struct dirent64 *dp = entries; dp < edp;
+	   dp = (void *)((uintptr_t) dp + dp->d_reclen))
+	{
+	  int fd = 0;
+
+	  if (dp->d_name[0] == '.')
+	    continue;
+
+	  for (const char *s = dp->d_name; isdigit (*s); s++)
+	    fd = 10 * fd + (*s - '0');
+
+	  if (fd == dirfd || fd < from)
+	    continue;
+
+	  __close_nocancel (fd);
+	}
+    }
+
+  __close_nocancel (dirfd);
+  return true;
+}
+
 /* Function used in the clone call to setup the signals mask, posix_spawn
    attributes, and file actions.  It run on its own stack (provided by the
    posix_spawn call).  */
@@ -280,6 +314,11 @@ __spawni_child (void *arguments)
 	      if (__fchdir (action->action.fchdir_action.fd) != 0)
 		goto fail;
 	      break;
+
+	    case spawn_do_closefrom:
+	      if (!spawn_closefrom (action->action.closefrom_action.from))
+		goto fail;
+	      break;
 	    }
 	}
     }
@@ -339,12 +378,13 @@ __spawnix (pid_t * pid, const char *file,
   int prot = (PROT_READ | PROT_WRITE
 	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
 
-  /* Add a slack area for child's stack.  */
-  size_t argv_size = (argc * sizeof (void *)) + 512;
+  size_t argv_size = (argc * sizeof (void *));
   /* We need at least a few pages in case the compiler's stack checking is
      enabled.  In some configs, it is known to use at least 24KiB.  We use
      32KiB to be "safe" from anything the compiler might do.  Besides, the
-     extra pages won't actually be allocated unless they get used.  */
+     extra pages won't actually be allocated unless they get used.
+     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
+     where it might use about 1k extra stack space.  */
   argv_size += (32 * 1024);
   size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
   void *stack = __mmap (NULL, stack_size, prot,
diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
index 59f85d9373..78a43f5851 100644
--- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
@@ -2052,6 +2052,7 @@ GLIBC_2.3.4 xdr_quad_t F
 GLIBC_2.3.4 xdr_u_quad_t F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
 GLIBC_2.4 __confstr_chk F
diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
index 67a4e238d6..b83897ddbf 100644
--- a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
+++ b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
@@ -2151,5 +2151,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
 GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
 GLIBC_2.30 getdents64 F
 GLIBC_2.30 gettid F
+GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
 GLIBC_2.30 tgkill F
 GLIBC_2.30 twalk_r F
-- 
2.17.1

^ permalink raw reply	[flat|nested] 59+ messages in thread

* [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (2 preceding siblings ...)
  2019-07-31 18:31 ` [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom Adhemerval Zanella
@ 2019-07-31 18:31 ` Adhemerval Zanella
  2019-08-28 14:10   ` Adhemerval Zanella
  2019-10-07 19:33   ` Florian Weimer
  2019-08-28 14:09 ` [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (3 subsequent siblings)
  7 siblings, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-07-31 18:31 UTC (permalink / raw)
  To: libc-alpha

Change from previous version:

  - Use libsupport and remove atfork usage on posix/wordexp-test.c.

--

This patch replaces the fork+exec by posix_spawn on wordexp, which
allows a better scability on Linux and simplifies the thread
cancellation handling.

The only change which can not be implemented with posix_spawn the
/dev/null check to certify it is indeed the expected device.  I am
not sure how effetive this check is since /dev/null tampering means
something very wrong with the system and this is the least of the
issues.  My view is the tests is really out of the place and the
hardening provided is minimum.

If the idea is still to provide such check, I think a possibilty
would be to open /dev/null, check it, add a dup2 file action, and
close the file descriptor.

Checked on powerpc64le-linux-gnu and x86_64-linux-gnu.

	* include/spawn.h (__posix_spawn_file_actions_addopen): New
	prototype.
	* posix/spawn_faction_addopen.c (posix_spawn_file_actions_addopen):
	Add internal alias.
	* posix/wordexp.c (create_environment, free_environment): New
	functions.
	(exec_comm_child, exec_comm): Use posix_spawn instead of fork+exec.
	* posix/wordexp-test.c: Use libsupport and remove atfork usage.
---
 include/spawn.h               |   3 +
 posix/spawn_faction_addopen.c |   8 +-
 posix/wordexp-test.c          | 142 +++++++++--------------------
 posix/wordexp.c               | 167 ++++++++++++++++------------------
 4 files changed, 129 insertions(+), 191 deletions(-)

diff --git a/include/spawn.h b/include/spawn.h
index 7fdd965bd7..4a0b1849da 100644
--- a/include/spawn.h
+++ b/include/spawn.h
@@ -11,6 +11,9 @@ __typeof (posix_spawn_file_actions_addclose)
 __typeof (posix_spawn_file_actions_adddup2)
   __posix_spawn_file_actions_adddup2 attribute_hidden;
 
+__typeof (posix_spawn_file_actions_addopen)
+  __posix_spawn_file_actions_addopen attribute_hidden;
+
 __typeof (posix_spawn_file_actions_destroy)
   __posix_spawn_file_actions_destroy attribute_hidden;
 
diff --git a/posix/spawn_faction_addopen.c b/posix/spawn_faction_addopen.c
index 742eb9526d..2e598de300 100644
--- a/posix/spawn_faction_addopen.c
+++ b/posix/spawn_faction_addopen.c
@@ -25,9 +25,9 @@
 /* Add an action to FILE-ACTIONS which tells the implementation to call
    `open' for the given file during the `spawn' call.  */
 int
-posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
-				  int fd, const char *path, int oflag,
-				  mode_t mode)
+__posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
+				    int fd, const char *path, int oflag,
+				    mode_t mode)
 {
   struct __spawn_action *rec;
 
@@ -60,3 +60,5 @@ posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
 
   return 0;
 }
+weak_alias (__posix_spawn_file_actions_addopen,
+	    posix_spawn_file_actions_addopen)
diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
index 10a0768a6b..ef780b0a65 100644
--- a/posix/wordexp-test.c
+++ b/posix/wordexp-test.c
@@ -15,39 +15,21 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/mman.h>
+#include <wordexp.h>
+#include <stdio.h>
 #include <fcntl.h>
-#include <unistd.h>
 #include <pwd.h>
-#include <stdio.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include <wordexp.h>
+#include <sys/mman.h>
+
 #include <libc-pointer-arith.h>
-#include <dso_handle.h>
+#include <array_length.h>
+#include <support/xunistd.h>
+#include <support/check.h>
 
 #define IFS " \n\t"
 
-extern int __register_atfork (void (*) (void), void (*) (void), void (*) (void), void *);
-
-static int __app_register_atfork (void (*prepare) (void), void (*parent) (void), void (*child) (void))
-{
-  return __register_atfork (prepare, parent, child, __dso_handle);
-}
-
-/* Number of forks seen.  */
-static int registered_forks;
-
-/* For each fork increment the fork count.  */
-static void
-register_fork (void)
-{
-  registered_forks++;
-}
-
 struct test_case_struct
 {
   int retval;
@@ -57,7 +39,7 @@ struct test_case_struct
   size_t wordc;
   const char *wordv[10];
   const char *ifs;
-} test_case[] =
+} static test_case[] =
   {
     /* Simple word- and field-splitting */
     { 0, NULL, "one", 0, 1, { "one", }, IFS },
@@ -238,8 +220,6 @@ struct test_case_struct
     { WRDE_SYNTAX, NULL, "${", 0, 0, { NULL, }, IFS },      /* BZ 18043  */
     { WRDE_SYNTAX, NULL, "L${a:", 0, 0, { NULL, }, IFS },   /* BZ 18043#c4  */
     { WRDE_SYNTAX, NULL, "$[1/0]", WRDE_NOCMD, 0, {NULL, }, IFS }, /* BZ 18100 */
-
-    { -1, NULL, NULL, 0, 0, { NULL, }, IFS },
   };
 
 static int testit (struct test_case_struct *tc);
@@ -256,16 +236,14 @@ command_line_test (const char *words)
     printf ("we_wordv[%d] = \"%s\"\n", i, we.we_wordv[i]);
 }
 
-int
-main (int argc, char *argv[])
+static int
+do_test (int argc, char *argv[])
 {
-  const char *globfile[] = { "one", "two", "three", NULL };
+  const char *globfile[] = { "one", "two", "three" };
   char tmpdir[32];
   struct passwd *pw;
   const char *cwd;
   int test;
-  int fail = 0;
-  int i;
   struct test_case_struct ts;
 
   if (argc > 1)
@@ -278,30 +256,18 @@ main (int argc, char *argv[])
 
   /* Set up arena for pathname expansion */
   tmpnam (tmpdir);
-  if (mkdir (tmpdir, S_IRWXU) || chdir (tmpdir))
-    return -1;
-  else
-    {
-      int fd;
+  xmkdir (tmpdir, S_IRWXU);
+  TEST_VERIFY_EXIT (chdir (tmpdir) == 0);
 
-      for (i = 0; globfile[i]; ++i)
-	if ((fd = creat (globfile[i], S_IRUSR | S_IWUSR)) == -1
-	    || close (fd))
-	  return -1;
-    }
-
-  /* If we are not allowed to do command substitution, we install
-     fork handlers to verify that no forks happened.  No forks should
-     happen at all if command substitution is disabled.  */
-  if (__app_register_atfork (register_fork, NULL, NULL) != 0)
+  for (int i = 0; i < array_length (globfile); ++i)
     {
-      printf ("Failed to register fork handler.\n");
-      return -1;
+      int fd = xopen (globfile[i], O_WRONLY|O_CREAT|O_TRUNC,
+		      S_IRUSR | S_IWUSR);
+      xclose (fd);
     }
 
-  for (test = 0; test_case[test].retval != -1; test++)
-    if (testit (&test_case[test]))
-      ++fail;
+  for (test = 0; test < array_length (test_case); test++)
+    TEST_COMPARE (testit (&test_case[test]), 0);
 
   /* Tilde-expansion tests. */
   pw = getpwnam ("root");
@@ -315,8 +281,7 @@ main (int argc, char *argv[])
       ts.wordv[0] = pw->pw_dir;
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
 
       ts.retval = 0;
       ts.env = pw->pw_dir;
@@ -326,8 +291,7 @@ main (int argc, char *argv[])
       ts.wordv[0] = "x";
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
     }
 
   /* "~" expands to value of $HOME when HOME is set */
@@ -342,8 +306,7 @@ main (int argc, char *argv[])
   ts.wordv[1] = "/dummy/home/foo";
   ts.ifs = IFS;
 
-  if (testit (&ts))
-    ++fail;
+  TEST_COMPARE (testit (&ts), 0);
 
   /* "~" expands to home dir from passwd file if HOME is not set */
 
@@ -359,8 +322,7 @@ main (int argc, char *argv[])
       ts.wordv[0] = pw->pw_dir;
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
     }
 
   /* Integer overflow in division.  */
@@ -375,37 +337,32 @@ main (int argc, char *argv[])
       "18446744073709551616",
       "170141183460469231731687303715884105728",
       "340282366920938463463374607431768211456",
-      NULL
     };
 
-    for (const char *const *num = numbers; *num; ++num)
+    for (int i = 0; i < array_length (numbers); i++)
       {
 	wordexp_t p;
 	char pattern[256];
-	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", *num);
+	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", numbers[i]);
 	int ret = wordexp (pattern, &p, WRDE_NOCMD);
 	if (ret == 0)
 	  {
-	    if (p.we_wordc != 1 || strcmp (p.we_wordv[0], *num) != 0)
-	      {
-		printf ("Integer overflow for \"%s\" failed", pattern);
-		++fail;
-	      }
+	    TEST_COMPARE (p.we_wordc, 1);
+	    TEST_COMPARE (strcmp (p.we_wordv[0], numbers[i]), 0);
 	    wordfree (&p);
 	  }
-	else if (ret != WRDE_SYNTAX)
+	else
 	  {
-	    printf ("Integer overflow for \"%s\" failed with %d",
-		    pattern, ret);
-	    ++fail;
+	    TEST_COMPARE (ret, WRDE_SYNTAX);
+	    if (ret != WRDE_SYNTAX)
+	      printf ("Integer overflow for \"%s\" failed with %d",
+		      pattern, ret);
 	  }
       }
   }
 
-  puts ("tests completed, now cleaning up");
-
   /* Clean up */
-  for (i = 0; globfile[i]; ++i)
+  for (int i = 0; i < array_length (globfile); ++i)
     remove (globfile[i]);
 
   if (cwd == NULL)
@@ -414,26 +371,17 @@ main (int argc, char *argv[])
   chdir (cwd);
   rmdir (tmpdir);
 
-  printf ("tests failed: %d\n", fail);
-
-  return fail != 0;
+  return 0;
 }
 
 static const char *
 at_page_end (const char *words)
 {
   const int pagesize = getpagesize ();
-  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
-		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
 
-  if (start == MAP_FAILED)
-    return start;
-
-  if (mprotect (start + pagesize, pagesize, PROT_NONE))
-    {
-      munmap (start, 2 * pagesize);
-      return MAP_FAILED;
-    }
+  xmprotect (start + pagesize, pagesize, PROT_NONE);
 
   /* Includes terminating NUL.  */
   const size_t words_size = strlen (words) + 1;
@@ -472,9 +420,6 @@ testit (struct test_case_struct *tc)
   fflush (NULL);
   const char *words = at_page_end (tc->words);
 
-  if (tc->flags & WRDE_NOCMD)
-    registered_forks = 0;
-
   if (tc->flags & WRDE_APPEND)
     {
       /* initial wordexp() call, to be appended to */
@@ -486,13 +431,6 @@ testit (struct test_case_struct *tc)
     }
   retval = wordexp (words, &we, tc->flags);
 
-  if ((tc->flags & WRDE_NOCMD)
-      && (registered_forks > 0))
-    {
-	  printf ("FAILED fork called for WRDE_NOCMD\n");
-	  return 1;
-    }
-
   if (tc->flags & WRDE_DOOFFS)
       start_offs = sav_we.we_offs;
 
@@ -551,9 +489,11 @@ testit (struct test_case_struct *tc)
   const int page_size = getpagesize ();
   char *start = (char *) PTR_ALIGN_DOWN (words, page_size);
 
-  if (munmap (start, 2 * page_size) != 0)
-    return 1;
+  xmunmap (start, 2 * page_size);
 
   fflush (NULL);
   return bzzzt;
 }
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/posix/wordexp.c b/posix/wordexp.c
index 22c6d18a9c..e1aafcaceb 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -25,33 +25,18 @@
 #include <libintl.h>
 #include <paths.h>
 #include <pwd.h>
-#include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
 #include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
-#include <wchar.h>
 #include <wordexp.h>
-#include <kernel-features.h>
+#include <spawn.h>
 #include <scratch_buffer.h>
-
-#include <libc-lock.h>
 #include <_itoa.h>
-
-/* Undefine the following line for the production version.  */
-/* #define NDEBUG 1 */
 #include <assert.h>
 
-/* Get some device information.  */
-#include <device-nrs.h>
-
 /*
  * This is a recursive-descent-style word expansion routine.
  */
@@ -812,61 +797,90 @@ parse_arith (char **word, size_t *word_length, size_t *max_length,
   return WRDE_SYNTAX;
 }
 
+static char **
+create_environment (void)
+{
+  size_t s = 0;
+
+  /* Calculate total environment size, including 'IFS' if is present.  */
+  for (char **ep = __environ; *ep != NULL; ep++, s++);
+
+  /* Include final NULL pointer.  */
+  char **newenviron = malloc (s * sizeof (char*));
+  if (newenviron == NULL)
+    return NULL;
+
+  /* Copy current environment excluding 'IFS', to make sure the subshell
+     doesn't field-split on our behalf. */
+  size_t i, j;
+  for (i = 0, j = 0; i < s; i++)
+    if (strncmp (__environ[i], "IFS=", sizeof ("IFS=")-1) != 0)
+      newenviron[j++] = __strdup (__environ[i]);
+  newenviron[j] = NULL;
+
+  return newenviron;
+}
+
+static void
+free_environment (char **environ)
+{
+  for (char **ep = environ; *ep != NULL; ep++)
+    free (*ep);
+  free (environ);
+}
+
 /* Function called by child process in exec_comm() */
-static inline void
-__attribute__ ((always_inline))
-exec_comm_child (char *comm, int *fildes, int showerr, int noexec)
+static pid_t
+exec_comm_child (char *comm, int *fildes, bool showerr, bool noexec)
 {
-  const char *args[4] = { _PATH_BSHELL, "-c", comm, NULL };
+  pid_t pid = -1;
 
-  /* Execute the command, or just check syntax? */
-  if (noexec)
-    args[1] = "-nc";
+  /* Execute the command, or just check syntax?  */
+  const char *args[] = { _PATH_BSHELL, noexec ? "-nc" : "-c", comm, NULL };
 
-  /* Redirect output.  */
-  if (__glibc_likely (fildes[1] != STDOUT_FILENO))
-    {
-      __dup2 (fildes[1], STDOUT_FILENO);
-      __close (fildes[1]);
-    }
-  else
-    /* Reset the close-on-exec flag (if necessary).  */
-    __fcntl (fildes[1], F_SETFD, 0);
+  posix_spawn_file_actions_t fa;
+  /* posix_spawn_file_actions_init does not fail.  */
+  __posix_spawn_file_actions_init (&fa);
 
-  /* Redirect stderr to /dev/null if we have to.  */
-  if (showerr == 0)
+  /* Redirect output.  For check syntax only (noexec being true), exec_comm
+     explicits sets fildes[1] to -1, so check its value to avoid a failure in
+     __posix_spawn_file_actions_adddup2.  */
+  if (fildes[1] != -1)
     {
-      struct stat64 st;
-      int fd;
-      __close (STDERR_FILENO);
-      fd = __open (_PATH_DEVNULL, O_WRONLY);
-      if (fd >= 0 && fd != STDERR_FILENO)
+      if (__glibc_likely (fildes[1] != STDOUT_FILENO))
 	{
-	  __dup2 (fd, STDERR_FILENO);
-	  __close (fd);
+	  if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1],
+						  STDOUT_FILENO) != 0
+	      || __posix_spawn_file_actions_addclose (&fa, fildes[1]) != 0)
+	    goto out;
 	}
-      /* Be paranoid.  Check that we actually opened the /dev/null
-	 device.  */
-      if (__builtin_expect (__fxstat64 (_STAT_VER, STDERR_FILENO, &st), 0) != 0
-	  || __builtin_expect (S_ISCHR (st.st_mode), 1) == 0
-#if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
-	  || st.st_rdev != __gnu_dev_makedev (DEV_NULL_MAJOR, DEV_NULL_MINOR)
-#endif
-	  )
-	/* It's not the /dev/null device.  Stop right here.  The
-	   problem is: how do we stop?  We use _exit() with an
-	   hopefully unusual exit code.  */
-	_exit (90);
+      else
+	/* Reset the close-on-exec flag (if necessary).  */
+	if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1], fildes[1])
+	    != 0)
+	  goto out;
     }
 
-  /* Make sure the subshell doesn't field-split on our behalf. */
-  __unsetenv ("IFS");
+  /* Redirect stderr to /dev/null if we have to.  */
+  if (!showerr)
+    if (__posix_spawn_file_actions_addopen (&fa, STDERR_FILENO, _PATH_DEVNULL,
+					    O_WRONLY, 0) != 0)
+      goto out;
+
+  char **newenv = create_environment ();
+  if (newenv == NULL)
+    goto out;
 
-  __close (fildes[0]);
-  __execve (_PATH_BSHELL, (char *const *) args, __environ);
+  /* pid is unset if posix_spawn fails, so it keep the original value
+     of -1.  */
+  __posix_spawn (&pid, _PATH_BSHELL, &fa, NULL, (char *const *) args, newenv);
 
-  /* Bad.  What now?  */
-  abort ();
+  free_environment (newenv);
+
+out:
+  __posix_spawn_file_actions_destroy (&fa);
+
+  return pid;
 }
 
 /* Function to execute a command and retrieve the results */
@@ -884,13 +898,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
   size_t maxnewlines = 0;
   char buffer[bufsize];
   pid_t pid;
-  int noexec = 0;
+  bool noexec = false;
 
   /* Do nothing if command substitution should not succeed.  */
   if (flags & WRDE_NOCMD)
     return WRDE_CMDSUB;
 
-  /* Don't fork() unless necessary */
+  /* Don't posix_spawn() unless necessary */
   if (!comm || !*comm)
     return 0;
 
@@ -898,19 +912,15 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
     return WRDE_NOSPACE;
 
  again:
-  if ((pid = __fork ()) < 0)
+  pid = exec_comm_child (comm, fildes, noexec ? false : flags & WRDE_SHOWERR,
+			 noexec);
+  if (pid < 0)
     {
-      /* Bad */
       __close (fildes[0]);
       __close (fildes[1]);
       return WRDE_NOSPACE;
     }
 
-  if (pid == 0)
-    exec_comm_child (comm, fildes, noexec ? 0 : flags & WRDE_SHOWERR, noexec);
-
-  /* Parent */
-
   /* If we are just testing the syntax, only wait.  */
   if (noexec)
     return (TEMP_FAILURE_RETRY (__waitpid (pid, &status, 0)) == pid
@@ -1091,7 +1101,7 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
   /* Check for syntax error (re-execute but with "-n" flag) */
   if (buflen < 1 && status != 0)
     {
-      noexec = 1;
+      noexec = true;
       goto again;
     }
 
@@ -1143,26 +1153,9 @@ parse_comm (char **word, size_t *word_length, size_t *max_length,
 	      /* Go -- give script to the shell */
 	      if (comm)
 		{
-#ifdef __libc_ptf_call
-		  /* We do not want the exec_comm call to be cut short
-		     by a thread cancellation since cleanup is very
-		     ugly.  Therefore disable cancellation for
-		     now.  */
-		  // XXX Ideally we do want the thread being cancelable.
-		  // XXX If demand is there we'll change it.
-		  int state = PTHREAD_CANCEL_ENABLE;
-		  __libc_ptf_call (__pthread_setcancelstate,
-				   (PTHREAD_CANCEL_DISABLE, &state), 0);
-#endif
-
+		  /* posix_spawn already handles thread cancellation.  */
 		  error = exec_comm (comm, word, word_length, max_length,
 				     flags, pwordexp, ifs, ifs_white);
-
-#ifdef __libc_ptf_call
-		  __libc_ptf_call (__pthread_setcancelstate,
-				   (state, NULL), 0);
-#endif
-
 		  free (comm);
 		}
 
-- 
2.17.1

^ permalink raw reply	[flat|nested] 59+ messages in thread

* [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
@ 2019-07-31 18:31 Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
                   ` (7 more replies)
  0 siblings, 8 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-07-31 18:31 UTC (permalink / raw)
  To: libc-alpha

This patch changes how the fallback getdents64 implementation calls
non-LFS getdents by replacing the scratch_buffer with static buffer
plus a loop on getdents calls.  This avoids the potential malloc
call on scratch_buffer_set_array_size for large input buffer size
at the cost of more getdents syscalls.

It also adds a small optimization for older kernels, where the first
ENOSYS failure for getdents64 disable subsequent calls.

Check the dirent tests on a mips64-linux-gnu with getdents64 code
disabled.

	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
	Add small optimization for older kernel to avoid issuing
	__NR_getdents64 on each call and replace scratch_buffer usage with
	a static allocated buffer.
---
 .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------
 1 file changed, 54 insertions(+), 68 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..3b5afd9324 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,98 +22,84 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static bool getdents64_supportted = true;
+  if (atomic_load_relaxed (&getdents64_supportted))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supportted, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
+  enum { KBUF_SIZE = 1024 };
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
-
-  const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
-
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
-
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
-
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
-
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  {
+    unsigned long d_ino;
+    unsigned long d_off;
+    unsigned short int d_reclen;
+    char d_name[1];
+  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
-  off64_t last_offset = -1;
   struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+
+  size_t nb = 0;
+  off64_t last_offset = -1;
+
+  ssize_t r;
+  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      struct kernel_dirent *skdp, *kdp;
+      skdp = kdp = kbuf;
+
+      while ((char *) kdp < (char *) skdp + r)
+	{
+	  const size_t alignment = _Alignof (struct dirent64);
+	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
+			      & ~(alignment - 1));
+	  if (nb + new_reclen > nbytes)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+		/* The new entry will overflow the input buffer, rewind to
+		   last obtained entry and return.  */
+	       __lseek64 (fd, last_offset, SEEK_SET);
+	       goto out;
 	    }
+	  nb += new_reclen;
 
-	  break;
-	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
-      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
-      memcpy (dp->d_name, kdp->d_name,
-	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
+	  dp->d_ino = kdp->d_ino;
+	  dp->d_off = last_offset = kdp->d_off;
+	  dp->d_reclen = new_reclen;
+	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
+	  memcpy (dp->d_name, kdp->d_name,
+		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
 
-      dp = (struct dirent64 *) ((char *) dp + new_reclen);
-      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
+	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	}
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+out:
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)
-- 
2.17.1

^ permalink raw reply	[flat|nested] 59+ messages in thread

* [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
@ 2019-07-31 18:31 ` Adhemerval Zanella
  2019-08-28 14:09   ` Adhemerval Zanella
  2019-07-31 18:31 ` [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom Adhemerval Zanella
                   ` (5 subsequent siblings)
  7 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-07-31 18:31 UTC (permalink / raw)
  To: libc-alpha

Changes from previous version:

  * Move the logic of stack mapping creation to stackmap.h and
    added a guard page allocation for the compatibility case.

--

The current internal posix_spawn symbol for Linux (__spawni) requires
to allocate a dynamic stack based on input arguments to handle the
SPAWN_XFLAGS_USE_PATH internal flag, which re-issue the input binary
as a shell script if execve call return ENOEXEC (to execute shell
scripts with an initial shebang).

This is done only for compatibility mode and the generic case does not
require the extra calculation plus the potential large mmap/munmap
call.  For default case, a pre-defined buffer is sufficed to use on the
clone call instead.

This patch optimizes Linux spawni by allocating a dynamic stack only
for compatibility symbol (SPAWN_XFLAGS_USE_PATH).  For generic case,
an mmap allocated buffer is used along with a guard page, similar to
what NPTL uses for thread stacks hardening.

For default case, it is a fixed code path with fixed stack usage in helper
process, so assuming a large enough stack buffer it would never overflow.
It also does not prevent to adapt to the vfork-like to re-use process
stack, once it is implemented.

Checked x86_64-linux-gnu and i686-linux-gnu.

	* sysdeps/unix/sysv/linux/spawni.c (posix_spawn_args): Remove
	argc member.
	(maybe_script_execute): Remove function.
	(execve_compat, __spawni_clone, __spawnix_compat): New function.
	(__spawni_child): Remove maybe_script_execute call.
	(__spawnix): Remove magic stack slack constant with stack_slack
	identifier.
	(__spawni): Only allocates a variable stack when
	SPAWN_XFLAGS_TRY_SHELL is used.
	* posix/stackmap.h: New file.
	* sysdeps/ia64/nptl/pthreaddef.h (NEED_SEPARATE_REGISTER_STACK): Move
	to ...
	* sysdeps/ia64/stackinfo.h: ... here.
---
 posix/stackmap.h                 | 115 +++++++++++++
 sysdeps/ia64/nptl/pthreaddef.h   |   3 -
 sysdeps/ia64/stackinfo.h         |   3 +
 sysdeps/unix/sysv/linux/spawni.c | 277 +++++++++++++++++++------------
 4 files changed, 285 insertions(+), 113 deletions(-)
 create mode 100644 posix/stackmap.h

diff --git a/posix/stackmap.h b/posix/stackmap.h
new file mode 100644
index 0000000000..be500e378a
--- /dev/null
+++ b/posix/stackmap.h
@@ -0,0 +1,115 @@
+/* Functions to create stack mappings for helper processes.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _STACKMAP_H
+#define _STACKMAP_H
+
+#include <unistd.h>
+#include <sys/mman.h>
+#include <ldsodefs.h>
+#include <stdbool.h>
+
+static inline int
+stack_prot (void)
+{
+  return (PROT_READ | PROT_WRITE
+	  | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
+}
+
+static inline size_t
+stack_guard_size (void)
+{
+ return GLRO (dl_pagesize);
+}
+
+/* Return a aligning mask based on system pagesize.  */
+static inline size_t
+stack_pagesize_m1_mask (void)
+{
+  size_t pagesize_m1 = __getpagesize () - 1;
+  return ~pagesize_m1;
+}
+
+/* Return the guard page position on memory segment MEM with total size SIZE
+   and with a guard page of size GUARDIZE.  */
+static inline void *
+stack_guard_position (void *mem, size_t size, size_t guardsize)
+{
+#ifdef NEED_SEPARATE_REGISTER_STACK
+  return mem + (((size - guardsize) / 2) & stack_pagesize_m1_mask ());
+#elif _STACK_GROWS_DOWN
+  return mem;
+#elif _STACK_GROWS_UP
+  return (void *) (((uintptr_t)(mem + size)- guardsize)
+		   & stack_pagesize_m1_mask ());
+#endif
+}
+
+/* Setup the expected stack memory protection value (based on stack_prot)
+   for the memory segment MEM with size SIZE based on the guard page
+   GUARD with size GUARDSIZE.  The memory segment is expected to be allocated
+   with PROT_NOTE.  */
+static inline bool
+stack_setup_prot (char *mem, size_t size, char *guard, size_t guardsize)
+{
+  const int prot = stack_prot ();
+
+  char *guardend = guard + guardsize;
+#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
+  /* As defined at guard_position, for architectures with downward stack
+     the guard page is always at start of the allocated area.  */
+  if (__mprotect (guardend, size - guardsize, prot) != 0)
+    return false;
+#else
+  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
+  if (__mprotect (mem, mprots1, prot) != 0)
+    return false;
+  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
+  if (__mprotect (guardend, mprots2, prot) != 0)
+    return false;
+#endif
+  return true;
+}
+
+/* Allocated a memory segment with size SIZE plus GUARSIZE with mmap and
+   setup the expected protection for both a guard page and the stack
+   itself.  */
+static inline void *
+stack_allocate (size_t size, size_t guardsize)
+{
+  const int prot = stack_prot ();
+
+  /* If a guard page is required, avoid committing memory by first
+     allocate with PROT_NONE and then reserve with required permission
+     excluding the guard page.  */
+  void *mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
+		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
+  if (guardsize)
+    {
+      void *guard = stack_guard_position (mem, size, guardsize);
+      if (!stack_setup_prot (mem, size, guard, guardsize))
+	{
+	  __munmap (mem, size);
+	  return MAP_FAILED;
+	}
+    }
+
+  return mem;
+}
+
+#endif /* _STACKMAP_H  */
diff --git a/sysdeps/ia64/nptl/pthreaddef.h b/sysdeps/ia64/nptl/pthreaddef.h
index bf52d5af62..11579f11b4 100644
--- a/sysdeps/ia64/nptl/pthreaddef.h
+++ b/sysdeps/ia64/nptl/pthreaddef.h
@@ -18,9 +18,6 @@
 /* Default stack size.  */
 #define ARCH_STACK_DEFAULT_SIZE	(32 * 1024 * 1024)
 
-/* IA-64 uses a normal stack and a register stack.  */
-#define NEED_SEPARATE_REGISTER_STACK
-
 /* Required stack pointer alignment at beginning.  */
 #define STACK_ALIGN		16
 
diff --git a/sysdeps/ia64/stackinfo.h b/sysdeps/ia64/stackinfo.h
index 6433a89945..d942426fcf 100644
--- a/sysdeps/ia64/stackinfo.h
+++ b/sysdeps/ia64/stackinfo.h
@@ -30,4 +30,7 @@
 /* Default to a non-executable stack.  */
 #define DEFAULT_STACK_PERMS (PF_R|PF_W)
 
+/* IA-64 uses a normal stack and a register stack.  */
+#define NEED_SEPARATE_REGISTER_STACK
+
 #endif	/* stackinfo.h */
diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
index ca7bf99825..0f7a8ca5df 100644
--- a/sysdeps/unix/sysv/linux/spawni.c
+++ b/sysdeps/unix/sysv/linux/spawni.c
@@ -23,10 +23,11 @@
 #include <not-cancel.h>
 #include <local-setxid.h>
 #include <shlib-compat.h>
-#include <sigsetops.h>
-#include <internal-signals.h>
-#include <ldsodefs.h>
+#include <nptl/pthreadP.h>
 #include <ctype.h>
+#include <dl-sysdep.h>
+#include <libc-pointer-arith.h>
+#include <stackmap.h>
 #include "spawn_int.h"
 
 /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
@@ -70,7 +71,6 @@
 # define STACK(__stack, __stack_size) (__stack + __stack_size)
 #endif
 
-
 struct posix_spawn_args
 {
   sigset_t oldmask;
@@ -79,37 +79,11 @@ struct posix_spawn_args
   const posix_spawn_file_actions_t *fa;
   const posix_spawnattr_t *restrict attr;
   char *const *argv;
-  ptrdiff_t argc;
   char *const *envp;
   int xflags;
   int err;
 };
 
-/* Older version requires that shell script without shebang definition
-   to be called explicitly using /bin/sh (_PATH_BSHELL).  */
-static void
-maybe_script_execute (struct posix_spawn_args *args)
-{
-  if (SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
-      && (args->xflags & SPAWN_XFLAGS_TRY_SHELL) && errno == ENOEXEC)
-    {
-      char *const *argv = args->argv;
-      ptrdiff_t argc = args->argc;
-
-      /* Construct an argument list for the shell.  */
-      char *new_argv[argc + 2];
-      new_argv[0] = (char *) _PATH_BSHELL;
-      new_argv[1] = (char *) args->file;
-      if (argc > 1)
-	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
-      else
-	new_argv[2] = NULL;
-
-      /* Execute the shell.  */
-      args->exec (new_argv[0], new_argv, args->envp);
-    }
-}
-
 /* Close all file descriptor up to FROM by interacting /proc/self/fd.  */
 static bool
 spawn_closefrom (int from)
@@ -152,7 +126,7 @@ spawn_closefrom (int from)
    attributes, and file actions.  It run on its own stack (provided by the
    posix_spawn call).  */
 static int
-__spawni_child (void *arguments)
+spawni_child (void *arguments)
 {
   struct posix_spawn_args *args = arguments;
   const posix_spawnattr_t *restrict attr = args->attr;
@@ -330,11 +304,6 @@ __spawni_child (void *arguments)
 
   args->exec (args->file, args->argv, args->envp);
 
-  /* This is compatibility function required to enable posix_spawn run
-     script without shebang definition for older posix_spawn versions
-     (2.15).  */
-  maybe_script_execute (args);
-
 fail:
   /* errno should have an appropriate non-zero value; otherwise,
      there's a bug in glibc or the kernel.  For lack of an error code
@@ -345,71 +314,12 @@ fail:
   _exit (SPAWN_ERROR);
 }
 
-/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
-   Before running the process perform the actions described in FILE-ACTIONS. */
 static int
-__spawnix (pid_t * pid, const char *file,
-	   const posix_spawn_file_actions_t * file_actions,
-	   const posix_spawnattr_t * attrp, char *const argv[],
-	   char *const envp[], int xflags,
-	   int (*exec) (const char *, char *const *, char *const *))
+spawni_clone (struct posix_spawn_args *args, void *stack, size_t stack_size,
+	      pid_t *pid)
 {
-  pid_t new_pid;
-  struct posix_spawn_args args;
   int ec;
-
-  /* To avoid imposing hard limits on posix_spawn{p} the total number of
-     arguments is first calculated to allocate a mmap to hold all possible
-     values.  */
-  ptrdiff_t argc = 0;
-  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
-     to be used in a execve call.  We limit to INT_MAX minus one due the
-     compatiblity code that may execute a shell script (maybe_script_execute)
-     where it will construct another argument list with an additional
-     argument.  */
-  ptrdiff_t limit = INT_MAX - 1;
-  while (argv[argc++] != NULL)
-    if (argc == limit)
-      {
-	errno = E2BIG;
-	return errno;
-      }
-
-  int prot = (PROT_READ | PROT_WRITE
-	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
-
-  size_t argv_size = (argc * sizeof (void *));
-  /* We need at least a few pages in case the compiler's stack checking is
-     enabled.  In some configs, it is known to use at least 24KiB.  We use
-     32KiB to be "safe" from anything the compiler might do.  Besides, the
-     extra pages won't actually be allocated unless they get used.
-     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
-     where it might use about 1k extra stack space.  */
-  argv_size += (32 * 1024);
-  size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
-  void *stack = __mmap (NULL, stack_size, prot,
-			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
-  if (__glibc_unlikely (stack == MAP_FAILED))
-    return errno;
-
-  /* Disable asynchronous cancellation.  */
-  int state;
-  __libc_ptf_call (__pthread_setcancelstate,
-                   (PTHREAD_CANCEL_DISABLE, &state), 0);
-
-  /* Child must set args.err to something non-negative - we rely on
-     the parent and child sharing VM.  */
-  args.err = 0;
-  args.file = file;
-  args.exec = exec;
-  args.fa = file_actions;
-  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
-  args.argv = argv;
-  args.argc = argc;
-  args.envp = envp;
-  args.xflags = xflags;
-
-  __libc_signal_block_all (&args.oldmask);
+  pid_t new_pid;
 
   /* The clone flags used will create a new child that will run in the same
      memory space (CLONE_VM) and the execution of calling thread will be
@@ -419,8 +329,8 @@ __spawnix (pid_t * pid, const char *file,
      need for CLONE_SETTLS.  Although parent and child share the same TLS
      namespace, there will be no concurrent access for TLS variables (errno
      for instance).  */
-  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
-		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
+  new_pid = CLONE (spawni_child, STACK (stack, stack_size), stack_size,
+		   CLONE_VM | CLONE_VFORK | SIGCHLD, args);
 
   /* It needs to collect the case where the auxiliary process was created
      but failed to execute the file (due either any preparation step or
@@ -433,7 +343,7 @@ __spawnix (pid_t * pid, const char *file,
 	 only in case of failure, so in case of premature termination
 	 due a signal args.err will remain zeroed and it will be up to
 	 caller to actually collect it.  */
-      ec = args.err;
+      ec = args->err;
       if (ec > 0)
 	/* There still an unlikely case where the child is cancelled after
 	   setting args.err, due to a positive error value.  Also there is
@@ -446,14 +356,139 @@ __spawnix (pid_t * pid, const char *file,
   else
     ec = -new_pid;
 
-  __munmap (stack, stack_size);
-
   if ((ec == 0) && (pid != NULL))
     *pid = new_pid;
 
-  __libc_signal_restore_set (&args.oldmask);
+  return ec;
+}
 
-  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
+#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
+/* This is compatibility function required to enable posix_spawn run
+   script without shebang definition for older posix_spawn versions
+   (2.15).  */
+static int
+execve_compat (const char *filename, char *const argv[], char *const envp[])
+{
+  __execve (filename, argv, envp);
+
+  if (errno == ENOEXEC)
+    {
+      char *const *cargv = argv;
+      ptrdiff_t argc = 0;
+      while (cargv[argc++] != NULL);
+
+      /* Construct an argument list for the shell.  */
+      char *new_argv[argc + 2];
+      new_argv[0] = (char *) _PATH_BSHELL;
+      new_argv[1] = (char *) filename;
+      if (argc > 1)
+	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
+      else
+	new_argv[2] = NULL;
+
+      /* Execute the shell.  */
+      __execve (new_argv[0], new_argv, envp);
+    }
+
+  return -1;
+}
+
+/* Allocates a stack using mmap to call clone.  The stack size is based on
+   number of arguments since it would be used on compat mode which may call
+   execvpe/execve_compat.  */
+static int
+spawnix_compat (struct posix_spawn_args *args, pid_t *pid)
+{
+  char *const *argv = args->argv;
+
+  /* To avoid imposing hard limits on posix_spawn{p} the total number of
+     arguments is first calculated to allocate a mmap to hold all possible
+     values.  */
+  ptrdiff_t argc = 0;
+  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
+     to be used in a execve call.  We limit to INT_MAX minus one due the
+     compatiblity code that may execute a shell script (maybe_script_execute)
+     where it will construct another argument list with an additional
+     argument.  */
+  ptrdiff_t limit = INT_MAX - 1;
+  while (argv[argc++] != NULL)
+    if (argc == limit)
+      {
+	errno = E2BIG;
+	return errno;
+      }
+
+  size_t argv_size = (argc * sizeof (void *));
+  /* We need at least a few pages in case the compiler's stack checking is
+     enabled.  In some configs, it is known to use at least 24KiB.  We use
+     32KiB to be "safe" from anything the compiler might do.  Besides, the
+     extra pages won't actually be allocated unless they get used.
+     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
+     where it might use about 1k extra stack space.  */
+  argv_size += (32 * 1024);
+
+  /* Allocate a stack with an extra guard page.  */
+  size_t guard_size = stack_guard_size ();
+  size_t stack_size = guard_size + ALIGN_UP (argv_size, __getpagesize ());
+  void *stack = stack_allocate (stack_size, guard_size);
+  if (__glibc_unlikely (stack == MAP_FAILED))
+    return errno;
+
+  int ec = spawni_clone (args, stack, stack_size, pid);
+
+  __munmap (stack, stack_size);
+
+  return ec;
+}
+#endif
+
+/* For SPAWN_XFLAGS_TRY_SHELL we need to execute a script even without
+   a shebang.  To accomplish it we pass as callback to spawni_child
+   __execvpe (which call maybe_script_execute for such case) or
+   execve_compat (which mimics the semantic using execve).  */
+static int
+spawn_process (struct posix_spawn_args *args, pid_t *pid)
+{
+  int ec;
+
+#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
+  if (args->xflags & SPAWN_XFLAGS_TRY_SHELL)
+    {
+      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
+		   ? __execvpe  : execve_compat;
+      ec = spawnix_compat (args, pid);
+    }
+  else
+#endif
+    {
+      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
+		   ? __execvpex : __execve;
+
+      /* spawni_clone stack usage need to take in consideration spawni_child
+	 stack usage and subsequent functions called:
+
+	 - sigprocmask: might allocate an extra sigset_t (128 bytes).
+	 - __libc_sigaction: allocate a struct kernel_sigaction (144 bytes on
+	   64-bit, 136 on 32-bit).
+	 - __sched_setparam, __sched_setscheduler, __setsig, __setpgid,
+	   local_seteuid, local_setegid, __close_nocancel, __getrlimit64,
+	   __close_nocancel, __open_nocancel, __dup2, __chdir, __fchdir:
+	   and direct syscall.
+	 - __fcntl: wrapper only uses local variables.
+	 - spawn_closefrom: uses up to 1024 bytes as local buffer
+	   - __direntries_read
+	     - __getdents64: MIPS64 uses up to buffer size used, 1024 in this
+	       specific usage.
+	   - __direntries_next: local variables.
+	   - __close_nocancel: direct syscall.
+         - execvpe allocates at least (NAME_MAX + 1) + PATH_MAX to create the
+	   combination of PATH entry and program name (1024 + 255 + 1).
+
+	 It allocates 2048 plus some stack for automatic variables and function
+	 calls.  */
+      char stack[2560];
+      ec = spawni_clone (args, stack, sizeof stack, pid);
+    }
 
   return ec;
 }
@@ -462,12 +497,34 @@ __spawnix (pid_t * pid, const char *file,
    Before running the process perform the actions described in FILE-ACTIONS. */
 int
 __spawni (pid_t * pid, const char *file,
-	  const posix_spawn_file_actions_t * acts,
+	  const posix_spawn_file_actions_t * file_actions,
 	  const posix_spawnattr_t * attrp, char *const argv[],
 	  char *const envp[], int xflags)
 {
-  /* It uses __execvpex to avoid run ENOEXEC in non compatibility mode (it
-     will be handled by maybe_script_execute).  */
-  return __spawnix (pid, file, acts, attrp, argv, envp, xflags,
-		    xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex :__execve);
+  /* Child must set args.err to something non-negative - we rely on
+     the parent and child sharing VM.  */
+  struct posix_spawn_args args = {
+    .err = 0,
+    .file = file,
+    .fa = file_actions,
+    .attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 },
+    .argv = argv,
+    .envp = envp,
+    .xflags = xflags
+  };
+
+  /* Disable asynchronous cancellation.  */
+  int state;
+  __libc_ptf_call (__pthread_setcancelstate,
+                   (PTHREAD_CANCEL_DISABLE, &state), 0);
+
+  __libc_signal_block_all (&args.oldmask);
+
+  int ec = spawn_process (&args, pid);
+
+  __libc_signal_restore_set (&args.oldmask);
+
+  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
+
+  return ec;
 }
-- 
2.17.1

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
@ 2019-08-28 14:09   ` Adhemerval Zanella
  2019-08-29  8:38   ` Florian Weimer
  1 sibling, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 14:09 UTC (permalink / raw)
  To: libc-alpha

Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> The child helper process on Linux posix_spawn child must ensure that no signal
> handler are enabled, so the signal disposition must be either SIG_DFL or
> SIG_IGN.  However, it requires a sigprocmask to obtain the current signal mask
> and at least _NSIG sigaction calls to reset the signal handlers for each
> posix_spawn call.
> 
> This patch optimizes it by tracking on sigaction implementation when a signal
> action is set different than SIG_DFL or SIG_IGN.  It allows remove a
> sigprocmask call and isse sigaction to reset the disposition only the signals
> that has non-default actions set.
> 
> It might incur in false positive, since it not easy to remove bits from the
> mask without race conditions, but it does not allow false negative since the
> mask is updated atomically prior the syscall.  The false positive incur in
> just extra sigactions on posix_spawn.
> 
> Checked on x86_64 and i686.
> 
> 	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
> 	New macros.
> 	* posix/Makefile (tests): Add tst-spawn6.
> 	* posix/tst-spawn6.c: New file.
> 	* sysdeps/generic/sigsetops.h (__sigorset_atomic): New macro.
> 	* sysdeps/unix/sysv/linux/internal-signals.h (__get_sighandler_set):
> 	New prototype.
> 	* sysdeps/unix/sysv/linux/sigaction.c (__get_sighandler_set): New
> 	function.
> 	(__libc_sigaction): Set the internal handler_set for a new action.
> 	* sysdeps/unix/sysv/linux/sigsetops.h (__sigorset_atomic,
> 	__sigaddset_atomic): New macros.
> 	* sysdeps/unix/sysv/linux/spawni.c (spawni_child): Replace
> 	__sigprocmask with __get_sighandler_set.
> ---
>  include/atomic.h                           |  10 +
>  posix/Makefile                             |   4 +-
>  posix/tst-spawn6.c                         | 220 +++++++++++++++++++++
>  sysdeps/generic/sigsetops.h                |   7 +
>  sysdeps/unix/sysv/linux/internal-signals.h |   3 +
>  sysdeps/unix/sysv/linux/sigaction.c        |  17 ++
>  sysdeps/unix/sysv/linux/sigsetops.h        |  15 ++
>  sysdeps/unix/sysv/linux/spawni.c           |   9 +-
>  8 files changed, 278 insertions(+), 7 deletions(-)
>  create mode 100644 posix/tst-spawn6.c
> 
> diff --git a/include/atomic.h b/include/atomic.h
> index ee1978eb3b..72609efde9 100644
> --- a/include/atomic.h
> +++ b/include/atomic.h
> @@ -646,6 +646,9 @@ void __atomic_link_error (void);
>  # define atomic_fetch_or_release(mem, operand) \
>    ({ __atomic_check_size((mem));					      \
>    __atomic_fetch_or ((mem), (operand), __ATOMIC_RELEASE); })
> +# define atomic_fetch_or_seq_cst(mem, operand) \
> +  ({ __atomic_check_size((mem));					      \
> +  __atomic_fetch_or ((mem), (operand), __ATOMIC_SEQ_CST); })
>  
>  # define atomic_fetch_xor_release(mem, operand) \
>    ({ __atomic_check_size((mem));					      \
> @@ -791,6 +794,13 @@ void __atomic_link_error (void);
>     ({ atomic_thread_fence_release ();					      \
>     atomic_fetch_or_acquire ((mem), (operand)); })
>  # endif
> +# ifndef atomic_fetch_or_seq_cst
> +#  define atomic_fetch_or_seq_cst(mem, operand) \
> +   ({ atomic_thread_fence_acquire ();					      \
> +   atomic_fetch_or_relaxed ((mem), (operand));				      \
> +   atomic_thread_fence_release (); })
> +# endif
> +
>  
>  # ifndef atomic_fetch_xor_release
>  /* Failing the atomic_compare_exchange_weak_release reloads the value in
> diff --git a/posix/Makefile b/posix/Makefile
> index 1ac41ad85a..131ae052fd 100644
> --- a/posix/Makefile
> +++ b/posix/Makefile
> @@ -102,7 +102,8 @@ tests		:= test-errno tstgetopt testfnm runtests runptests \
>  		   tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
>  		   tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
>  		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir \
> -		   tst-spawn5
> +		   tst-spawn5 \
> +		   tst-spawn6
>  tests-internal	:= bug-regex5 bug-regex20 bug-regex33 \
>  		   tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
>  		   tst-glob_lstat_compat tst-spawn4-compat
> @@ -255,6 +256,7 @@ tst-exec-ARGS = -- $(host-test-program-cmd)
>  tst-exec-static-ARGS = $(tst-exec-ARGS)
>  tst-execvpe5-ARGS = -- $(host-test-program-cmd)
>  tst-spawn-ARGS = -- $(host-test-program-cmd)
> +tst-spawn6-ARGS = -- $(host-test-program-cmd)
>  tst-spawn-static-ARGS = $(tst-spawn-ARGS)
>  tst-spawn5-ARGS = -- $(host-test-program-cmd)
>  tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
> diff --git a/posix/tst-spawn6.c b/posix/tst-spawn6.c
> new file mode 100644
> index 0000000000..466e66f104
> --- /dev/null
> +++ b/posix/tst-spawn6.c
> @@ -0,0 +1,220 @@
> +/* Tests for posix_spawn signal handling.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <spawn.h>
> +#include <sys/wait.h>
> +
> +#include <support/check.h>
> +#include <support/xunistd.h>
> +#include <support/support.h>
> +#include <array_length.h>
> +
> +/* Nonzero if the program gets called via `exec'.  */
> +static int restart;
> +#define CMDLINE_OPTIONS \
> +  { "restart", no_argument, &restart, 1 },
> +
> +enum spawn_test_t
> +{
> +  SPAWN_SETSIGMASK,
> +  SPAWN_SETSIGDEF
> +};
> +
> +static int signal_to_check[] =
> +{
> +  SIGHUP, SIGINT, SIGALRM, SIGUSR2
> +};
> +
> +/* Called on process re-execution.  */
> +static int
> +handle_restart (enum spawn_test_t test)
> +{
> +  switch (test)
> +    {
> +    case SPAWN_SETSIGMASK:
> +      {
> +	sigset_t mask;
> +	sigprocmask (SIG_BLOCK, NULL, &mask);
> +	for (int i = 0; i < array_length (signal_to_check); i++)
> +	  if (sigismember (&mask, signal_to_check[i]) != 1)
> +	    exit (EXIT_FAILURE);
> +      }
> +      break;
> +    case SPAWN_SETSIGDEF:
> +      {
> +	for (int i = 0; i < array_length (signal_to_check); i++)
> +	  {
> +	    struct sigaction act;
> +	    if (sigaction (signal_to_check[i], NULL, &act) != 0)
> +	      exit (EXIT_FAILURE);
> +	    if (act.sa_handler != SIG_DFL)
> +	      exit (EXIT_FAILURE);
> +	  }
> +      }
> +      break;
> +    }
> +
> +  return 0;
> +}
> +
> +/* Common argument used for process re-execution.  */
> +static char *initial_spargv[5];
> +static size_t initial_spargv_size;
> +
> +/* Re-execute the test process with both '--direct', '--restart', and the
> +   TEST (as integer value) as arguments.  */
> +static void
> +reexecute (enum spawn_test_t test, const posix_spawnattr_t *attrp)
> +{
> +  char *spargv[8];
> +  int i;
> +
> +  for (i = 0; i < initial_spargv_size; i++)
> +    spargv[i] = initial_spargv[i];
> +  /* Three digits per byte plus null terminator.  */
> +  char teststr[3 * sizeof (test) + 1];
> +  snprintf (teststr, array_length (teststr), "%d", test);
> +  spargv[i++] = teststr;
> +  spargv[i] = NULL;
> +  TEST_VERIFY (i < 8);
> +
> +  pid_t pid;
> +  int status;
> +
> +  TEST_COMPARE (posix_spawn (&pid, spargv[0], NULL, attrp, spargv, environ),
> +		0);
> +  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
> +  TEST_VERIFY (WIFEXITED (status));
> +  TEST_VERIFY (!WIFSIGNALED (status));
> +  TEST_COMPARE (WEXITSTATUS (status), 0);
> +}
> +
> +/* Test if POSIX_SPAWN_SETSIGMASK change the spawn process signal mask to
> +   the value blocked signals defined by SIGNAL_TO_CHECK signals.  */
> +static void
> +do_test_setsigmask (void)
> +{
> +  posix_spawnattr_t attr;
> +  /* posix_spawnattr_init does not fail.  */
> +  posix_spawnattr_init (&attr);
> +
> +  {
> +    sigset_t mask;
> +    TEST_COMPARE (sigemptyset (&mask), 0);
> +    for (int i = 0; i < array_length (signal_to_check); i++)
> +      TEST_COMPARE (sigaddset (&mask, signal_to_check[i]), 0);
> +    TEST_COMPARE (posix_spawnattr_setsigmask (&attr, &mask), 0);
> +    TEST_COMPARE (posix_spawnattr_setflags (&attr, POSIX_SPAWN_SETSIGMASK), 0);
> +  }
> +
> +  /* Change current mask to be different than the one asked for spawned
> +     process.  */
> +  {
> +    sigset_t empty_mask, current_mask;
> +    TEST_COMPARE (sigemptyset (&empty_mask), 0);
> +    TEST_COMPARE (sigprocmask (SIG_BLOCK, &empty_mask, &current_mask), 0);
> +
> +    reexecute (SPAWN_SETSIGMASK, &attr);
> +
> +    TEST_COMPARE (sigprocmask (SIG_SETMASK, &current_mask, NULL), 0);
> +  }
> +}
> +
> +/* Test if POSIX_SPAWN_SETSIGDEF change the spawn process signal actions
> +   defined by SIGNAL_TO_CHECK signals to default actions.  */
> +static void
> +do_test_setsigdef (void)
> +{
> +  posix_spawnattr_t attr;
> +  /* posix_spawnattr_init does not fail.  */
> +  posix_spawnattr_init (&attr);
> +
> +  {
> +    sigset_t mask;
> +    TEST_COMPARE (sigemptyset (&mask), 0);
> +    for (int i = 0; i < array_length (signal_to_check); i++)
> +      TEST_COMPARE (sigaddset (&mask, signal_to_check[i]), 0);
> +    TEST_COMPARE (posix_spawnattr_setsigdefault (&attr, &mask), 0);
> +    TEST_COMPARE (posix_spawnattr_setflags (&attr, POSIX_SPAWN_SETSIGDEF), 0);
> +  }
> +
> +  /* Change current signal disposition to be different than the one asked for
> +     spawned process.  */
> +  struct sigaction default_act[array_length (signal_to_check)];
> +  {
> +    sigset_t empty_mask;
> +    TEST_COMPARE (sigemptyset (&empty_mask), 0);
> +    for (int i = 0; i < array_length (signal_to_check); i++)
> +      TEST_COMPARE (sigaction (signal_to_check[i],
> +			       &((struct sigaction) { .sa_handler = SIG_IGN,
> +						      .sa_mask = empty_mask,
> +						      .sa_flags = 0 }),
> +			       &default_act[i]),
> +		    0);
> +  }
> +
> +  reexecute (SPAWN_SETSIGDEF, &attr);
> +
> +  /* Restore signal dispositions.  */
> +  for (int i = 0; i < array_length (signal_to_check); i++)
> +    TEST_COMPARE (sigaction (signal_to_check[i], &default_act[i], NULL), 0);
> +}
> +
> +static int
> +do_test (int argc, char *argv[])
> +{
> +  /* We must have one or four parameters left if called initially:
> +       + path for ld.so		optional
> +       + "--library-path"	optional
> +       + the library path	optional
> +       + the application name
> +
> +     Plus one parameter to indicate which test to execute through
> +     re-execution.
> +
> +     So for default usage without --enable-hardcoded-path-in-tests, it
> +     will be called initially with 5 arguments and later with 2.  For
> +     --enable-hardcoded-path-in-tests it will be called with 2 arguments
> +     regardless.  */
> +
> +  if (argc != (restart ? 2 : 5) && argc != 2)
> +    FAIL_EXIT1 ("wrong number of arguments (%d)", argc);
> +
> +  if (restart)
> +    return handle_restart (atoi (argv[1]));
> +
> +  {
> +    int i;
> +    for (i = 0; i < (argc == 5 ? 4 : 1); i++)
> +      initial_spargv[i] = argv[i + 1];
> +    initial_spargv[i++] = (char *) "--direct";
> +    initial_spargv[i++] = (char *) "--restart";
> +    initial_spargv_size = i;
> +  }
> +
> +  do_test_setsigmask ();
> +  do_test_setsigdef ();
> +
> +  return 0;
> +}
> +
> +#define TEST_FUNCTION_ARGV do_test
> +#include <support/test-driver.c>
> diff --git a/sysdeps/generic/sigsetops.h b/sysdeps/generic/sigsetops.h
> index ddeeb0b0d5..9cae11771b 100644
> --- a/sysdeps/generic/sigsetops.h
> +++ b/sysdeps/generic/sigsetops.h
> @@ -66,6 +66,13 @@
>      0;						\
>    }))
>  
> +# define __sigorset_atomic(set)			\
> +  (__extension__ ({				\
> +    __sigset_t __mask = __sigmask (sig);	\
> +    atomic_fetch_or_seq_cst (set, mask);	\
> +    0;						\
> +  }))
> +
>  # define __sigdelset(set, sig)			\
>    (__extension__ ({				\
>      __sigset_t __mask = __sigmask (sig);	\
> diff --git a/sysdeps/unix/sysv/linux/internal-signals.h b/sysdeps/unix/sysv/linux/internal-signals.h
> index 3562011d21..385442f81e 100644
> --- a/sysdeps/unix/sysv/linux/internal-signals.h
> +++ b/sysdeps/unix/sysv/linux/internal-signals.h
> @@ -88,4 +88,7 @@ __libc_signal_restore_set (const sigset_t *set)
>  /* Used to communicate with signal handler.  */
>  extern struct xid_command *__xidcmd attribute_hidden;
>  
> +/* Used to obtained the modified signal handlers.  */
> +extern void __get_sighandler_set (sigset_t *set) attribute_hidden;
> +
>  #endif
> diff --git a/sysdeps/unix/sysv/linux/sigaction.c b/sysdeps/unix/sysv/linux/sigaction.c
> index 52722b08ae..3bcf3946ab 100644
> --- a/sysdeps/unix/sysv/linux/sigaction.c
> +++ b/sysdeps/unix/sysv/linux/sigaction.c
> @@ -20,6 +20,7 @@
>  #include <string.h>
>  
>  #include <sysdep.h>
> +#include <sigsetops.h>
>  #include <sys/syscall.h>
>  
>  /* New ports should not define the obsolete SA_RESTORER, however some
> @@ -36,6 +37,13 @@
>  # define STUB(act, sigsetsize) (sigsetsize)
>  #endif
>  
> +static sigset_t handler_set;
> +
> +void __get_sighandler_set (sigset_t *set)
> +{
> +  *set = handler_set;
> +}
> +
>  /* If ACT is not NULL, change the action for SIG to *ACT.
>     If OACT is not NULL, put the old action for SIG in *OACT.  */
>  int
> @@ -47,6 +55,15 @@ __libc_sigaction (int sig, const struct sigaction *act, struct sigaction *oact)
>  
>    if (act)
>      {
> +      /* Tracks which signal had a signal handler set different from default
> +	 (SIG_DFL/SIG_IGN).  It allows optimize posix_spawn to reset only
> +	 those signals.  It might incur in false positive, since it not easy
> +	 to remove bits from the mask without race conditions, but it does not
> +	 allow false negative since the mask is updated atomically prior the
> +	 syscall.  The false positive incur in just extra sigactions on
> +	 posix_spawn.  */
> +      if (act->sa_handler != SIG_DFL && act->sa_handler != SIG_IGN)
> +	__sigaddset_atomic (&handler_set, sig);
>        kact.k_sa_handler = act->sa_handler;
>        memcpy (&kact.sa_mask, &act->sa_mask, sizeof (sigset_t));
>        kact.sa_flags = act->sa_flags;
> diff --git a/sysdeps/unix/sysv/linux/sigsetops.h b/sysdeps/unix/sysv/linux/sigsetops.h
> index 713d4840d8..6c98c83e42 100644
> --- a/sysdeps/unix/sysv/linux/sigsetops.h
> +++ b/sysdeps/unix/sysv/linux/sigsetops.h
> @@ -20,6 +20,7 @@
>  #define _SIGSETOPS_H 1
>  
>  #include <signal.h>
> +#include <atomic.h>
>  
>  /* Return a mask that includes the bit for SIG only.  */
>  # define __sigmask(sig) \
> @@ -80,6 +81,12 @@
>      (void)0;							\
>    }))
>  
> +# define __sigorset_atomic(dest, left, right)	\
> +  (__extension__ ({				\
> +     atomic_fetch_or_seq_cst (dest, left, right); \
> +    0;						\
> +  }))
> +
>  /* These macros needn't check for a bogus signal number;
>     error checking is done in the non-__ versions.  */
>  # define __sigismember(set, sig)				\
> @@ -97,6 +104,14 @@
>      (void)0;							\
>    }))
>  
> +# define __sigaddset_atomic(set, sig)				\
> +  (__extension__ ({						\
> +    unsigned long int __mask = __sigmask (sig);			\
> +    unsigned long int __word = __sigword (sig);			\
> +    atomic_fetch_or_seq_cst (&((set)->__val[__word]), __mask);	\
> +    (void)0;							\
> +  }))
> +
>  # define __sigdelset(set, sig)					\
>    (__extension__ ({						\
>      unsigned long int __mask = __sigmask (sig);			\
> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index 0f7a8ca5df..264edd09c6 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -132,17 +132,14 @@ spawni_child (void *arguments)
>    const posix_spawnattr_t *restrict attr = args->attr;
>    const posix_spawn_file_actions_t *file_actions = args->fa;
>  
> -  /* The child must ensure that no signal handler are enabled because it shared
> +  /* The child must ensure that no signal handler are enabled because it share
>       memory with parent, so the signal disposition must be either SIG_DFL or
> -     SIG_IGN.  It does by iterating over all signals and although it could
> -     possibly be more optimized (by tracking which signal potentially have a
> -     signal handler), it might requires system specific solutions (since the
> -     sigset_t data type can be very different on different architectures).  */
> +     SIG_IGN.  */
>    struct sigaction sa;
>    memset (&sa, '\0', sizeof (sa));
>  
>    sigset_t hset;
> -  __sigprocmask (SIG_BLOCK, 0, &hset);
> +  __get_sighandler_set (&hset);
>    for (int sig = 1; sig < _NSIG; ++sig)
>      {
>        if ((attr->__flags & POSIX_SPAWN_SETSIGDEF)
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn
  2019-07-31 18:31 ` [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn Adhemerval Zanella
@ 2019-08-28 14:09   ` Adhemerval Zanella
  2019-10-07 17:50     ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 14:09 UTC (permalink / raw)
  To: libc-alpha

Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> Changes from previous version:
> 
>   * Move the logic of stack mapping creation to stackmap.h and
>     added a guard page allocation for the compatibility case.
> 
> --
> 
> The current internal posix_spawn symbol for Linux (__spawni) requires
> to allocate a dynamic stack based on input arguments to handle the
> SPAWN_XFLAGS_USE_PATH internal flag, which re-issue the input binary
> as a shell script if execve call return ENOEXEC (to execute shell
> scripts with an initial shebang).
> 
> This is done only for compatibility mode and the generic case does not
> require the extra calculation plus the potential large mmap/munmap
> call.  For default case, a pre-defined buffer is sufficed to use on the
> clone call instead.
> 
> This patch optimizes Linux spawni by allocating a dynamic stack only
> for compatibility symbol (SPAWN_XFLAGS_USE_PATH).  For generic case,
> an mmap allocated buffer is used along with a guard page, similar to
> what NPTL uses for thread stacks hardening.
> 
> For default case, it is a fixed code path with fixed stack usage in helper
> process, so assuming a large enough stack buffer it would never overflow.
> It also does not prevent to adapt to the vfork-like to re-use process
> stack, once it is implemented.
> 
> Checked x86_64-linux-gnu and i686-linux-gnu.
> 
> 	* sysdeps/unix/sysv/linux/spawni.c (posix_spawn_args): Remove
> 	argc member.
> 	(maybe_script_execute): Remove function.
> 	(execve_compat, __spawni_clone, __spawnix_compat): New function.
> 	(__spawni_child): Remove maybe_script_execute call.
> 	(__spawnix): Remove magic stack slack constant with stack_slack
> 	identifier.
> 	(__spawni): Only allocates a variable stack when
> 	SPAWN_XFLAGS_TRY_SHELL is used.
> 	* posix/stackmap.h: New file.
> 	* sysdeps/ia64/nptl/pthreaddef.h (NEED_SEPARATE_REGISTER_STACK): Move
> 	to ...
> 	* sysdeps/ia64/stackinfo.h: ... here.
> ---
>  posix/stackmap.h                 | 115 +++++++++++++
>  sysdeps/ia64/nptl/pthreaddef.h   |   3 -
>  sysdeps/ia64/stackinfo.h         |   3 +
>  sysdeps/unix/sysv/linux/spawni.c | 277 +++++++++++++++++++------------
>  4 files changed, 285 insertions(+), 113 deletions(-)
>  create mode 100644 posix/stackmap.h
> 
> diff --git a/posix/stackmap.h b/posix/stackmap.h
> new file mode 100644
> index 0000000000..be500e378a
> --- /dev/null
> +++ b/posix/stackmap.h
> @@ -0,0 +1,115 @@
> +/* Functions to create stack mappings for helper processes.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _STACKMAP_H
> +#define _STACKMAP_H
> +
> +#include <unistd.h>
> +#include <sys/mman.h>
> +#include <ldsodefs.h>
> +#include <stdbool.h>
> +
> +static inline int
> +stack_prot (void)
> +{
> +  return (PROT_READ | PROT_WRITE
> +	  | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
> +}
> +
> +static inline size_t
> +stack_guard_size (void)
> +{
> + return GLRO (dl_pagesize);
> +}
> +
> +/* Return a aligning mask based on system pagesize.  */
> +static inline size_t
> +stack_pagesize_m1_mask (void)
> +{
> +  size_t pagesize_m1 = __getpagesize () - 1;
> +  return ~pagesize_m1;
> +}
> +
> +/* Return the guard page position on memory segment MEM with total size SIZE
> +   and with a guard page of size GUARDIZE.  */
> +static inline void *
> +stack_guard_position (void *mem, size_t size, size_t guardsize)
> +{
> +#ifdef NEED_SEPARATE_REGISTER_STACK
> +  return mem + (((size - guardsize) / 2) & stack_pagesize_m1_mask ());
> +#elif _STACK_GROWS_DOWN
> +  return mem;
> +#elif _STACK_GROWS_UP
> +  return (void *) (((uintptr_t)(mem + size)- guardsize)
> +		   & stack_pagesize_m1_mask ());
> +#endif
> +}
> +
> +/* Setup the expected stack memory protection value (based on stack_prot)
> +   for the memory segment MEM with size SIZE based on the guard page
> +   GUARD with size GUARDSIZE.  The memory segment is expected to be allocated
> +   with PROT_NOTE.  */
> +static inline bool
> +stack_setup_prot (char *mem, size_t size, char *guard, size_t guardsize)
> +{
> +  const int prot = stack_prot ();
> +
> +  char *guardend = guard + guardsize;
> +#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
> +  /* As defined at guard_position, for architectures with downward stack
> +     the guard page is always at start of the allocated area.  */
> +  if (__mprotect (guardend, size - guardsize, prot) != 0)
> +    return false;
> +#else
> +  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
> +  if (__mprotect (mem, mprots1, prot) != 0)
> +    return false;
> +  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
> +  if (__mprotect (guardend, mprots2, prot) != 0)
> +    return false;
> +#endif
> +  return true;
> +}
> +
> +/* Allocated a memory segment with size SIZE plus GUARSIZE with mmap and
> +   setup the expected protection for both a guard page and the stack
> +   itself.  */
> +static inline void *
> +stack_allocate (size_t size, size_t guardsize)
> +{
> +  const int prot = stack_prot ();
> +
> +  /* If a guard page is required, avoid committing memory by first
> +     allocate with PROT_NONE and then reserve with required permission
> +     excluding the guard page.  */
> +  void *mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
> +		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
> +  if (guardsize)
> +    {
> +      void *guard = stack_guard_position (mem, size, guardsize);
> +      if (!stack_setup_prot (mem, size, guard, guardsize))
> +	{
> +	  __munmap (mem, size);
> +	  return MAP_FAILED;
> +	}
> +    }
> +
> +  return mem;
> +}
> +
> +#endif /* _STACKMAP_H  */
> diff --git a/sysdeps/ia64/nptl/pthreaddef.h b/sysdeps/ia64/nptl/pthreaddef.h
> index bf52d5af62..11579f11b4 100644
> --- a/sysdeps/ia64/nptl/pthreaddef.h
> +++ b/sysdeps/ia64/nptl/pthreaddef.h
> @@ -18,9 +18,6 @@
>  /* Default stack size.  */
>  #define ARCH_STACK_DEFAULT_SIZE	(32 * 1024 * 1024)
>  
> -/* IA-64 uses a normal stack and a register stack.  */
> -#define NEED_SEPARATE_REGISTER_STACK
> -
>  /* Required stack pointer alignment at beginning.  */
>  #define STACK_ALIGN		16
>  
> diff --git a/sysdeps/ia64/stackinfo.h b/sysdeps/ia64/stackinfo.h
> index 6433a89945..d942426fcf 100644
> --- a/sysdeps/ia64/stackinfo.h
> +++ b/sysdeps/ia64/stackinfo.h
> @@ -30,4 +30,7 @@
>  /* Default to a non-executable stack.  */
>  #define DEFAULT_STACK_PERMS (PF_R|PF_W)
>  
> +/* IA-64 uses a normal stack and a register stack.  */
> +#define NEED_SEPARATE_REGISTER_STACK
> +
>  #endif	/* stackinfo.h */
> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index ca7bf99825..0f7a8ca5df 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -23,10 +23,11 @@
>  #include <not-cancel.h>
>  #include <local-setxid.h>
>  #include <shlib-compat.h>
> -#include <sigsetops.h>
> -#include <internal-signals.h>
> -#include <ldsodefs.h>
> +#include <nptl/pthreadP.h>
>  #include <ctype.h>
> +#include <dl-sysdep.h>
> +#include <libc-pointer-arith.h>
> +#include <stackmap.h>
>  #include "spawn_int.h"
>  
>  /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
> @@ -70,7 +71,6 @@
>  # define STACK(__stack, __stack_size) (__stack + __stack_size)
>  #endif
>  
> -
>  struct posix_spawn_args
>  {
>    sigset_t oldmask;
> @@ -79,37 +79,11 @@ struct posix_spawn_args
>    const posix_spawn_file_actions_t *fa;
>    const posix_spawnattr_t *restrict attr;
>    char *const *argv;
> -  ptrdiff_t argc;
>    char *const *envp;
>    int xflags;
>    int err;
>  };
>  
> -/* Older version requires that shell script without shebang definition
> -   to be called explicitly using /bin/sh (_PATH_BSHELL).  */
> -static void
> -maybe_script_execute (struct posix_spawn_args *args)
> -{
> -  if (SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
> -      && (args->xflags & SPAWN_XFLAGS_TRY_SHELL) && errno == ENOEXEC)
> -    {
> -      char *const *argv = args->argv;
> -      ptrdiff_t argc = args->argc;
> -
> -      /* Construct an argument list for the shell.  */
> -      char *new_argv[argc + 2];
> -      new_argv[0] = (char *) _PATH_BSHELL;
> -      new_argv[1] = (char *) args->file;
> -      if (argc > 1)
> -	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
> -      else
> -	new_argv[2] = NULL;
> -
> -      /* Execute the shell.  */
> -      args->exec (new_argv[0], new_argv, args->envp);
> -    }
> -}
> -
>  /* Close all file descriptor up to FROM by interacting /proc/self/fd.  */
>  static bool
>  spawn_closefrom (int from)
> @@ -152,7 +126,7 @@ spawn_closefrom (int from)
>     attributes, and file actions.  It run on its own stack (provided by the
>     posix_spawn call).  */
>  static int
> -__spawni_child (void *arguments)
> +spawni_child (void *arguments)
>  {
>    struct posix_spawn_args *args = arguments;
>    const posix_spawnattr_t *restrict attr = args->attr;
> @@ -330,11 +304,6 @@ __spawni_child (void *arguments)
>  
>    args->exec (args->file, args->argv, args->envp);
>  
> -  /* This is compatibility function required to enable posix_spawn run
> -     script without shebang definition for older posix_spawn versions
> -     (2.15).  */
> -  maybe_script_execute (args);
> -
>  fail:
>    /* errno should have an appropriate non-zero value; otherwise,
>       there's a bug in glibc or the kernel.  For lack of an error code
> @@ -345,71 +314,12 @@ fail:
>    _exit (SPAWN_ERROR);
>  }
>  
> -/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
> -   Before running the process perform the actions described in FILE-ACTIONS. */
>  static int
> -__spawnix (pid_t * pid, const char *file,
> -	   const posix_spawn_file_actions_t * file_actions,
> -	   const posix_spawnattr_t * attrp, char *const argv[],
> -	   char *const envp[], int xflags,
> -	   int (*exec) (const char *, char *const *, char *const *))
> +spawni_clone (struct posix_spawn_args *args, void *stack, size_t stack_size,
> +	      pid_t *pid)
>  {
> -  pid_t new_pid;
> -  struct posix_spawn_args args;
>    int ec;
> -
> -  /* To avoid imposing hard limits on posix_spawn{p} the total number of
> -     arguments is first calculated to allocate a mmap to hold all possible
> -     values.  */
> -  ptrdiff_t argc = 0;
> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
> -     to be used in a execve call.  We limit to INT_MAX minus one due the
> -     compatiblity code that may execute a shell script (maybe_script_execute)
> -     where it will construct another argument list with an additional
> -     argument.  */
> -  ptrdiff_t limit = INT_MAX - 1;
> -  while (argv[argc++] != NULL)
> -    if (argc == limit)
> -      {
> -	errno = E2BIG;
> -	return errno;
> -      }
> -
> -  int prot = (PROT_READ | PROT_WRITE
> -	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
> -
> -  size_t argv_size = (argc * sizeof (void *));
> -  /* We need at least a few pages in case the compiler's stack checking is
> -     enabled.  In some configs, it is known to use at least 24KiB.  We use
> -     32KiB to be "safe" from anything the compiler might do.  Besides, the
> -     extra pages won't actually be allocated unless they get used.
> -     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
> -     where it might use about 1k extra stack space.  */
> -  argv_size += (32 * 1024);
> -  size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
> -  void *stack = __mmap (NULL, stack_size, prot,
> -			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
> -  if (__glibc_unlikely (stack == MAP_FAILED))
> -    return errno;
> -
> -  /* Disable asynchronous cancellation.  */
> -  int state;
> -  __libc_ptf_call (__pthread_setcancelstate,
> -                   (PTHREAD_CANCEL_DISABLE, &state), 0);
> -
> -  /* Child must set args.err to something non-negative - we rely on
> -     the parent and child sharing VM.  */
> -  args.err = 0;
> -  args.file = file;
> -  args.exec = exec;
> -  args.fa = file_actions;
> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
> -  args.argv = argv;
> -  args.argc = argc;
> -  args.envp = envp;
> -  args.xflags = xflags;
> -
> -  __libc_signal_block_all (&args.oldmask);
> +  pid_t new_pid;
>  
>    /* The clone flags used will create a new child that will run in the same
>       memory space (CLONE_VM) and the execution of calling thread will be
> @@ -419,8 +329,8 @@ __spawnix (pid_t * pid, const char *file,
>       need for CLONE_SETTLS.  Although parent and child share the same TLS
>       namespace, there will be no concurrent access for TLS variables (errno
>       for instance).  */
> -  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
> -		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
> +  new_pid = CLONE (spawni_child, STACK (stack, stack_size), stack_size,
> +		   CLONE_VM | CLONE_VFORK | SIGCHLD, args);
>  
>    /* It needs to collect the case where the auxiliary process was created
>       but failed to execute the file (due either any preparation step or
> @@ -433,7 +343,7 @@ __spawnix (pid_t * pid, const char *file,
>  	 only in case of failure, so in case of premature termination
>  	 due a signal args.err will remain zeroed and it will be up to
>  	 caller to actually collect it.  */
> -      ec = args.err;
> +      ec = args->err;
>        if (ec > 0)
>  	/* There still an unlikely case where the child is cancelled after
>  	   setting args.err, due to a positive error value.  Also there is
> @@ -446,14 +356,139 @@ __spawnix (pid_t * pid, const char *file,
>    else
>      ec = -new_pid;
>  
> -  __munmap (stack, stack_size);
> -
>    if ((ec == 0) && (pid != NULL))
>      *pid = new_pid;
>  
> -  __libc_signal_restore_set (&args.oldmask);
> +  return ec;
> +}
>  
> -  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
> +#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
> +/* This is compatibility function required to enable posix_spawn run
> +   script without shebang definition for older posix_spawn versions
> +   (2.15).  */
> +static int
> +execve_compat (const char *filename, char *const argv[], char *const envp[])
> +{
> +  __execve (filename, argv, envp);
> +
> +  if (errno == ENOEXEC)
> +    {
> +      char *const *cargv = argv;
> +      ptrdiff_t argc = 0;
> +      while (cargv[argc++] != NULL);
> +
> +      /* Construct an argument list for the shell.  */
> +      char *new_argv[argc + 2];
> +      new_argv[0] = (char *) _PATH_BSHELL;
> +      new_argv[1] = (char *) filename;
> +      if (argc > 1)
> +	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
> +      else
> +	new_argv[2] = NULL;
> +
> +      /* Execute the shell.  */
> +      __execve (new_argv[0], new_argv, envp);
> +    }
> +
> +  return -1;
> +}
> +
> +/* Allocates a stack using mmap to call clone.  The stack size is based on
> +   number of arguments since it would be used on compat mode which may call
> +   execvpe/execve_compat.  */
> +static int
> +spawnix_compat (struct posix_spawn_args *args, pid_t *pid)
> +{
> +  char *const *argv = args->argv;
> +
> +  /* To avoid imposing hard limits on posix_spawn{p} the total number of
> +     arguments is first calculated to allocate a mmap to hold all possible
> +     values.  */
> +  ptrdiff_t argc = 0;
> +  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
> +     to be used in a execve call.  We limit to INT_MAX minus one due the
> +     compatiblity code that may execute a shell script (maybe_script_execute)
> +     where it will construct another argument list with an additional
> +     argument.  */
> +  ptrdiff_t limit = INT_MAX - 1;
> +  while (argv[argc++] != NULL)
> +    if (argc == limit)
> +      {
> +	errno = E2BIG;
> +	return errno;
> +      }
> +
> +  size_t argv_size = (argc * sizeof (void *));
> +  /* We need at least a few pages in case the compiler's stack checking is
> +     enabled.  In some configs, it is known to use at least 24KiB.  We use
> +     32KiB to be "safe" from anything the compiler might do.  Besides, the
> +     extra pages won't actually be allocated unless they get used.
> +     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
> +     where it might use about 1k extra stack space.  */
> +  argv_size += (32 * 1024);
> +
> +  /* Allocate a stack with an extra guard page.  */
> +  size_t guard_size = stack_guard_size ();
> +  size_t stack_size = guard_size + ALIGN_UP (argv_size, __getpagesize ());
> +  void *stack = stack_allocate (stack_size, guard_size);
> +  if (__glibc_unlikely (stack == MAP_FAILED))
> +    return errno;
> +
> +  int ec = spawni_clone (args, stack, stack_size, pid);
> +
> +  __munmap (stack, stack_size);
> +
> +  return ec;
> +}
> +#endif
> +
> +/* For SPAWN_XFLAGS_TRY_SHELL we need to execute a script even without
> +   a shebang.  To accomplish it we pass as callback to spawni_child
> +   __execvpe (which call maybe_script_execute for such case) or
> +   execve_compat (which mimics the semantic using execve).  */
> +static int
> +spawn_process (struct posix_spawn_args *args, pid_t *pid)
> +{
> +  int ec;
> +
> +#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
> +  if (args->xflags & SPAWN_XFLAGS_TRY_SHELL)
> +    {
> +      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
> +		   ? __execvpe  : execve_compat;
> +      ec = spawnix_compat (args, pid);
> +    }
> +  else
> +#endif
> +    {
> +      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
> +		   ? __execvpex : __execve;
> +
> +      /* spawni_clone stack usage need to take in consideration spawni_child
> +	 stack usage and subsequent functions called:
> +
> +	 - sigprocmask: might allocate an extra sigset_t (128 bytes).
> +	 - __libc_sigaction: allocate a struct kernel_sigaction (144 bytes on
> +	   64-bit, 136 on 32-bit).
> +	 - __sched_setparam, __sched_setscheduler, __setsig, __setpgid,
> +	   local_seteuid, local_setegid, __close_nocancel, __getrlimit64,
> +	   __close_nocancel, __open_nocancel, __dup2, __chdir, __fchdir:
> +	   and direct syscall.
> +	 - __fcntl: wrapper only uses local variables.
> +	 - spawn_closefrom: uses up to 1024 bytes as local buffer
> +	   - __direntries_read
> +	     - __getdents64: MIPS64 uses up to buffer size used, 1024 in this
> +	       specific usage.
> +	   - __direntries_next: local variables.
> +	   - __close_nocancel: direct syscall.
> +         - execvpe allocates at least (NAME_MAX + 1) + PATH_MAX to create the
> +	   combination of PATH entry and program name (1024 + 255 + 1).
> +
> +	 It allocates 2048 plus some stack for automatic variables and function
> +	 calls.  */
> +      char stack[2560];
> +      ec = spawni_clone (args, stack, sizeof stack, pid);
> +    }
>  
>    return ec;
>  }
> @@ -462,12 +497,34 @@ __spawnix (pid_t * pid, const char *file,
>     Before running the process perform the actions described in FILE-ACTIONS. */
>  int
>  __spawni (pid_t * pid, const char *file,
> -	  const posix_spawn_file_actions_t * acts,
> +	  const posix_spawn_file_actions_t * file_actions,
>  	  const posix_spawnattr_t * attrp, char *const argv[],
>  	  char *const envp[], int xflags)
>  {
> -  /* It uses __execvpex to avoid run ENOEXEC in non compatibility mode (it
> -     will be handled by maybe_script_execute).  */
> -  return __spawnix (pid, file, acts, attrp, argv, envp, xflags,
> -		    xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex :__execve);
> +  /* Child must set args.err to something non-negative - we rely on
> +     the parent and child sharing VM.  */
> +  struct posix_spawn_args args = {
> +    .err = 0,
> +    .file = file,
> +    .fa = file_actions,
> +    .attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 },
> +    .argv = argv,
> +    .envp = envp,
> +    .xflags = xflags
> +  };
> +
> +  /* Disable asynchronous cancellation.  */
> +  int state;
> +  __libc_ptf_call (__pthread_setcancelstate,
> +                   (PTHREAD_CANCEL_DISABLE, &state), 0);
> +
> +  __libc_signal_block_all (&args.oldmask);
> +
> +  int ec = spawn_process (&args, pid);
> +
> +  __libc_signal_restore_set (&args.oldmask);
> +
> +  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
> +
> +  return ec;
>  }
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom
  2019-07-31 18:31 ` [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom Adhemerval Zanella
@ 2019-08-28 14:09   ` Adhemerval Zanella
  2019-08-28 17:22     ` Joseph Myers
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 14:09 UTC (permalink / raw)
  To: libc-alpha

Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> Changes from previous version:
> 
>   - Remove __spawn_valid_fd check on posix_spawn_file_actions_addclosefrom.
> 
>   - Rename __SUPPORT_SPAWN_CLOSEFROM macro to __SPAWN_SUPPORT_CLOSEFROM.
> 
>   - Use getdents64 instead of changing add internal __opendir_inplace.
> 
> Regarding the lseek to reset the descriptor offset after each file removal,
> I think that by iteracting the descriptor based on getdents64 results there
> is no real need for it.  The kernel returns the fds sequentially and there
> is no open operations done concurrently, so both my expectations and the
> tests result is getdents returns an updated version after a close() call.
> 
> --
> 
> This patch adds a way to close a range of file descriptors on posix_spawn
> as a new file action.  The API is similar to the one provided by Solaris
> 11 [1], where the file action causes the all open file descriptors greater
> than or equal to input on to be closed when the new process is spawned.
> 
> There are some discussions on BZ#10353 [2], although the bug itself asks
> for a generic solution (similar to the closeall provided by some BSD).
> The posix_spawn is safe to be implemented by interacting over /proc/self/fd,
> the Linux spawni.c does not use CLONE_FILES, so the helper process has its
> own file descriptor table and any failure (in /proc operation) aborts the
> process creation and returns an error to the caller.
> 
> I am aware that this file action might be redundant to the current approach
> of POSIX in promoting O_CLOEXEC in more interfaces. However O_CLOEXEC is still
> not the default and for some specific usages, the caller needs to close all
> possible file descriptors to avoid them leaking.  Some examples are CPython
> (discussed in BZ#10353) and OpenJDK jspawnhelper [3] (where OpenJDK spawns a
> helper process to exactly closes all file descriptors).  Most likely any
> environment which calls functions that might open file descriptor under the
> hood and aim to use posix_spawn might face the same requirement.
> 
> Checked on x86_64-linux-gnu, i686-linux-gnu, powerpc64le-linux-gnu, and
> aarch64-linux-gnu.
> 
> 	* posix/Makefile (routines): Add spawn_faction_addclosefrom.
> 	(tests): Add tst-spawn5.
> 	(tst-spawn5-ARGS): New rule.
> 	* posix/Versions [GLIBC_2.30] (libc): Add
> 	posix_spawn_file_actions_addclosefrom_np.
> 	* posix/spawn.h (posix_spawn_file_actions_addclosefrom_np): New
> 	prototype.
> 	* posix/spawn_faction_addclosefrom.c: New file
> 	* posix/spawn_faction_destroy.c (__posix_spawn_file_actions_destroy):
> 	Handle spawn_do_closefrom.
> 	* posix/spawn_int.h (__spawn_action): Add closefrom_action and
> 	spawn_do_closefrom.
> 	* posix/spawn_int_abi.h: New file.
> 	* sysdeps/unix/sysv/linux/spawn_int_abi.h: Likewise.
> 	* posix/tst-spawn5.c: Likewise.
> 	* sysdeps/mach/hurd/spawni.c (__spawni, __spawni_child): Handle
> 	spawn_do_closefrom.
> 	* sysdeps/posix/spawni.c (__spawni_child): Likewise.
> 	* sysdeps/unix/sysv/linux/spawni.c (__spawni_child, __spawnix):
> 	Likewise.
> 	(spawn_closefrom): New function.
> 	* sysdeps/mach/hurd/i386/libc.abilist (2.30): Add
> 	posix_spawn_file_actions_addclosefrom_np.
> 	* sysdeps/unix/sysv/linux/aarch64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/alpha/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/arm/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/csky/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/hppa/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/i386/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/ia64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/microblaze/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/nios2/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist:
> 	Likewise.
> 	* sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/sh/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/unix/sysv/linux/x86_64/64/libc.abilist: Likewise.
> 	* sysdeps/unix/sysv/unix/sysv/linux/x86_64/x32/libc.abilist: Likewise.
> ---
>  posix/Makefile                                |   5 +-
>  posix/Versions                                |   1 +
>  posix/spawn.h                                 |   7 +
>  posix/spawn_faction_addclosefrom.c            |  58 +++++
>  posix/spawn_faction_destroy.c                 |   1 +
>  posix/spawn_int.h                             |   6 +
>  posix/spawn_int_abi.h                         |  27 +++
>  posix/tst-spawn5.c                            | 210 ++++++++++++++++++
>  sysdeps/mach/hurd/i386/libc.abilist           |   1 +
>  sysdeps/mach/hurd/spawni.c                    |   4 +
>  sysdeps/posix/spawni.c                        |   4 +
>  sysdeps/unix/sysv/linux/aarch64/libc.abilist  |   1 +
>  sysdeps/unix/sysv/linux/alpha/libc.abilist    |   1 +
>  sysdeps/unix/sysv/linux/arm/libc.abilist      |   1 +
>  sysdeps/unix/sysv/linux/csky/libc.abilist     |   1 +
>  sysdeps/unix/sysv/linux/hppa/libc.abilist     |   1 +
>  sysdeps/unix/sysv/linux/i386/libc.abilist     |   1 +
>  sysdeps/unix/sysv/linux/ia64/libc.abilist     |   1 +
>  .../sysv/linux/m68k/coldfire/libc.abilist     |   1 +
>  .../unix/sysv/linux/m68k/m680x0/libc.abilist  |   1 +
>  .../unix/sysv/linux/microblaze/libc.abilist   |   1 +
>  .../sysv/linux/mips/mips32/fpu/libc.abilist   |   1 +
>  .../sysv/linux/mips/mips32/nofpu/libc.abilist |   1 +
>  .../sysv/linux/mips/mips64/n32/libc.abilist   |   1 +
>  .../sysv/linux/mips/mips64/n64/libc.abilist   |   1 +
>  sysdeps/unix/sysv/linux/nios2/libc.abilist    |   1 +
>  .../linux/powerpc/powerpc32/fpu/libc.abilist  |   1 +
>  .../linux/powerpc/powerpc64/be/libc.abilist   |   1 +
>  .../linux/powerpc/powerpc64/le/libc.abilist   |   1 +
>  .../unix/sysv/linux/riscv/rv64/libc.abilist   |   1 +
>  .../unix/sysv/linux/s390/s390-32/libc.abilist |   1 +
>  .../unix/sysv/linux/s390/s390-64/libc.abilist |   1 +
>  sysdeps/unix/sysv/linux/sh/libc.abilist       |   1 +
>  .../sysv/linux/sparc/sparc32/libc.abilist     |   1 +
>  .../sysv/linux/sparc/sparc64/libc.abilist     |   1 +
>  sysdeps/unix/sysv/linux/spawn_int_abi.h       |  25 +++
>  sysdeps/unix/sysv/linux/spawni.c              |  62 +++++-
>  .../unix/sysv/linux/x86_64/64/libc.abilist    |   1 +
>  .../unix/sysv/linux/x86_64/x32/libc.abilist   |   1 +
>  39 files changed, 425 insertions(+), 12 deletions(-)
>  create mode 100644 posix/spawn_faction_addclosefrom.c
>  create mode 100644 posix/spawn_int_abi.h
>  create mode 100644 posix/tst-spawn5.c
>  create mode 100644 sysdeps/unix/sysv/linux/spawn_int_abi.h
> 
> diff --git a/posix/Makefile b/posix/Makefile
> index 8ac6743ad7..1ac41ad85a 100644
> --- a/posix/Makefile
> +++ b/posix/Makefile
> @@ -57,6 +57,7 @@ routines :=								      \
>  	spawn_faction_init spawn_faction_destroy spawn_faction_addclose	      \
>  	spawn_faction_addopen spawn_faction_adddup2 spawn_valid_fd	      \
>  	spawn_faction_addchdir spawn_faction_addfchdir			      \
> +	spawn_faction_addclosefrom					      \
>  	spawnattr_init spawnattr_destroy				      \
>  	spawnattr_getdefault spawnattr_setdefault			      \
>  	spawnattr_getflags spawnattr_setflags				      \
> @@ -100,7 +101,8 @@ tests		:= test-errno tstgetopt testfnm runtests runptests \
>  		   tst-posix_fadvise tst-posix_fadvise64 \
>  		   tst-sysconf-empty-chroot tst-glob_symlinks tst-fexecve \
>  		   tst-glob-tilde test-ssize-max tst-spawn4 bug-regex37 \
> -		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir
> +		   bug-regex38 tst-regcomp-truncated tst-spawn-chdir \
> +		   tst-spawn5
>  tests-internal	:= bug-regex5 bug-regex20 bug-regex33 \
>  		   tst-rfc3484 tst-rfc3484-2 tst-rfc3484-3 \
>  		   tst-glob_lstat_compat tst-spawn4-compat
> @@ -254,6 +256,7 @@ tst-exec-static-ARGS = $(tst-exec-ARGS)
>  tst-execvpe5-ARGS = -- $(host-test-program-cmd)
>  tst-spawn-ARGS = -- $(host-test-program-cmd)
>  tst-spawn-static-ARGS = $(tst-spawn-ARGS)
> +tst-spawn5-ARGS = -- $(host-test-program-cmd)
>  tst-dir-ARGS = `pwd` `cd $(common-objdir)/$(subdir); pwd` `cd $(common-objdir); pwd` $(objpfx)tst-dir
>  tst-chmod-ARGS = $(objdir)
>  tst-vfork3-ARGS = --test-dir=$(objpfx)
> diff --git a/posix/Versions b/posix/Versions
> index 7d06a6d0c0..c8268e5996 100644
> --- a/posix/Versions
> +++ b/posix/Versions
> @@ -146,6 +146,7 @@ libc {
>      posix_spawn_file_actions_addfchdir_np;
>    }
>    GLIBC_2.30 {
> +    posix_spawn_file_actions_addclosefrom_np;
>    }
>    GLIBC_PRIVATE {
>      __libc_fork; __libc_pread; __libc_pwrite;
> diff --git a/posix/spawn.h b/posix/spawn.h
> index 471dbea022..773f416b2e 100644
> --- a/posix/spawn.h
> +++ b/posix/spawn.h
> @@ -213,6 +213,13 @@ extern int posix_spawn_file_actions_addchdir_np (posix_spawn_file_actions_t *
>  extern int posix_spawn_file_actions_addfchdir_np (posix_spawn_file_actions_t *,
>  						  int __fd)
>       __THROW __nonnull ((1));
> +
> +/* Add an action to close all file descriptor greater than FROM during
> +   spawn.  This affects the subsequent file actions.  */
> +extern int posix_spawn_file_actions_addclosefrom_np (posix_spawn_file_actions_t *,
> +						     int __from)
> +     __THROW __nonnull ((1));
> +
>  #endif
>  
>  __END_DECLS
> diff --git a/posix/spawn_faction_addclosefrom.c b/posix/spawn_faction_addclosefrom.c
> new file mode 100644
> index 0000000000..52e949c8b3
> --- /dev/null
> +++ b/posix/spawn_faction_addclosefrom.c
> @@ -0,0 +1,58 @@
> +/* Add a closefrom to a file action list for posix_spawn.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <errno.h>
> +#include <spawn.h>
> +#include <unistd.h>
> +#include <spawn_int.h>
> +
> +int
> +__posix_spawn_file_actions_addclosefrom (posix_spawn_file_actions_t
> +					 *file_actions, int from)
> +{
> +#if __SPAWN_SUPPORT_CLOSEFROM
> +  struct __spawn_action *rec;
> +
> +  if (fd < 0)
> +    return EBADF;
> +
> +  /* Allocate more memory if needed.  */
> +  if (file_actions->__used == file_actions->__allocated
> +      && __posix_spawn_file_actions_realloc (file_actions) != 0)
> +    /* This can only mean we ran out of memory.  */
> +    return ENOMEM;
> +
> +  /* Add the new value.  */
> +  rec = &file_actions->__actions[file_actions->__used];
> +  rec->tag = spawn_do_closefrom;
> +  rec->action.closefrom_action.from = from;
> +
> +  /* Account for the new entry.  */
> +  ++file_actions->__used;
> +
> +  return 0;
> +#else
> +  __set_errno (EINVAL);
> +  return -1;
> +#endif
> +}
> +weak_alias (__posix_spawn_file_actions_addclosefrom,
> +	    posix_spawn_file_actions_addclosefrom_np)
> +#if !__SPAWN_SUPPORT_CLOSEFROM
> +stub_warning (posix_spawn_file_actions_addclosefrom_np)
> +#endif
> diff --git a/posix/spawn_faction_destroy.c b/posix/spawn_faction_destroy.c
> index 51fab13585..b45d1cd889 100644
> --- a/posix/spawn_faction_destroy.c
> +++ b/posix/spawn_faction_destroy.c
> @@ -39,6 +39,7 @@ __posix_spawn_file_actions_destroy (posix_spawn_file_actions_t *file_actions)
>  	case spawn_do_close:
>  	case spawn_do_dup2:
>  	case spawn_do_fchdir:
> +	case spawn_do_closefrom:
>  	  /* No cleanup required.  */
>  	  break;
>  	}
> diff --git a/posix/spawn_int.h b/posix/spawn_int.h
> index 93b7597f90..0bc29226e4 100644
> --- a/posix/spawn_int.h
> +++ b/posix/spawn_int.h
> @@ -20,6 +20,7 @@
>  #define _SPAWN_INT_H
>  
>  #include <spawn.h>
> +#include <spawn_int_abi.h>
>  #include <stdbool.h>
>  
>  /* Data structure to contain the action information.  */
> @@ -32,6 +33,7 @@ struct __spawn_action
>      spawn_do_open,
>      spawn_do_chdir,
>      spawn_do_fchdir,
> +    spawn_do_closefrom,
>    } tag;
>  
>    union
> @@ -60,6 +62,10 @@ struct __spawn_action
>      {
>        int fd;
>      } fchdir_action;
> +    struct
> +    {
> +      int from;
> +    } closefrom_action;
>    } action;
>  };
>  
> diff --git a/posix/spawn_int_abi.h b/posix/spawn_int_abi.h
> new file mode 100644
> index 0000000000..142efed339
> --- /dev/null
> +++ b/posix/spawn_int_abi.h
> @@ -0,0 +1,27 @@
> +/* Internal ABI specific for posix_spawn functionality.  Generic version.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SPAWN_INT_ABI_H
> +#define _SPAWN_INT_ABI_H
> +
> +/* The closefrom file actions requires either a syscall or an arch-specific
> +   way to interact over all file descriptors and act uppon them (such
> +   /proc/self/fd on Linux).  */
> +#define __SPAWN_SUPPOR_CLOSEFROM 0
> +
> +#endif /* _SPAWN_INT_H */
> diff --git a/posix/tst-spawn5.c b/posix/tst-spawn5.c
> new file mode 100644
> index 0000000000..7af33a4dbe
> --- /dev/null
> +++ b/posix/tst-spawn5.c
> @@ -0,0 +1,210 @@
> +/* Tests for posix_spawn signal handling.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#include <stdio.h>
> +#include <stdlib.h>
> +#include <getopt.h>
> +#include <spawn.h>
> +#include <fcntl.h>
> +#include <sys/wait.h>
> +#include <dirent.h>
> +#include <stdbool.h>
> +#include <errno.h>
> +#include <limits.h>
> +
> +#include <support/check.h>
> +#include <support/xunistd.h>
> +#include <support/support.h>
> +#include <array_length.h>
> +
> +/* Nonzero if the program gets called via `exec'.  */
> +static int restart;
> +#define CMDLINE_OPTIONS \
> +  { "restart", no_argument, &restart, 1 },
> +
> +/* Called on process re-execution.  */
> +static int
> +handle_restart (int from)
> +{
> +  DIR *fds = opendir ("/proc/self/fd");
> +  if (fds == NULL)
> +    FAIL_EXIT1 ("opendir (\"/proc/self/fd\"): %m");
> +
> +  while (true)
> +    {
> +      errno = 0;
> +      struct dirent64 *e = readdir64 (fds);
> +      if (e == NULL)
> +        {
> +          if (errno != 0)
> +            FAIL_EXIT1 ("readdir: %m");
> +          break;
> +        }
> +
> +      if (e->d_name[0] == '.')
> +        continue;
> +
> +      char *endptr;
> +      long int fd = strtol (e->d_name, &endptr, 10);
> +      if (*endptr != '\0' || fd < 0 || fd > INT_MAX)
> +        FAIL_EXIT1 ("readdir: invalid file descriptor name: /proc/self/fd/%s",
> +                    e->d_name);
> +
> +      /* Skip the descriptor which is used to enumerate the
> +         descriptors.  */
> +      if (fd == dirfd (fds))
> +        continue;
> +
> +      struct stat64 st;
> +      if (fstat64 (fd, &st) != 0)
> +        FAIL_EXIT1 ("readdir: fstat64 (%ld) failed: %m", fd);
> +
> +      if (fd >= from)
> +	FAIL_EXIT1 ("error: fd (%ld) greater than from (%d)", fd, from);
> +    }
> +
> +  closedir (fds);
> +
> +  return 0;
> +}
> +
> +/* Common argument used for process re-execution.  */
> +static char *initial_spargv[5];
> +static size_t initial_spargv_size;
> +
> +/* Re-execute the test process with both '--direct', '--restart', and the
> +   TEST (as integer value) as arguments.  */
> +static void
> +reexecute (int fd, const posix_spawn_file_actions_t *fa)
> +{
> +  char *spargv[8];
> +  int i;
> +
> +  for (i = 0; i < initial_spargv_size; i++)
> +    spargv[i] = initial_spargv[i];
> +  /* Three digits per byte plus null terminator.  */
> +  char teststr[3 * sizeof (fd) + 1];
> +  snprintf (teststr, array_length (teststr), "%d", fd);
> +  spargv[i++] = teststr;
> +  spargv[i] = NULL;
> +  TEST_VERIFY (i < 8);
> +
> +  pid_t pid;
> +  int status;
> +
> +  TEST_COMPARE (posix_spawn (&pid, spargv[0], fa, NULL, spargv, environ),
> +		0);
> +  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
> +  TEST_VERIFY (WIFEXITED (status));
> +  TEST_VERIFY (!WIFSIGNALED (status));
> +  TEST_COMPARE (WEXITSTATUS (status), 0);
> +}
> +
> +static void
> +do_test_closefrom (int num_fd_to_open)
> +{
> +  int *fds = xmalloc (num_fd_to_open * sizeof (int));
> +  for (int i = 0; i < num_fd_to_open; i++)
> +    fds[i] = xopen ("/dev/null", O_WRONLY, 0);
> +
> +  posix_spawn_file_actions_t fa;
> +  /* posix_spawn_file_actions_init does not fail.  */
> +  posix_spawn_file_actions_init (&fa);
> +
> +  {
> +    int ret = posix_spawn_file_actions_addclosefrom_np (&fa, fds[0]);
> +    if (ret == -1)
> +      {
> +	if (errno == ENOSYS)
> +	  /* Hurd currently does not support closefrom fileaction.  */
> +	  FAIL_UNSUPPORTED ("posix_spawn_file_actions_addclosefrom_np unsupported");
> +        else
> +	  FAIL_EXIT1 ("posix_spawn_file_actions_addclosefrom_np failed");
> +      }
> +  }
> +
> +  /* Default check, all file descriptor from [fd[0], fd[1]) are opened.  */
> +  reexecute (fds[0], &fa);
> +
> +  /* Add a gap in the range.  */
> +  xclose (fds[num_fd_to_open/2]);
> +  xclose (fds[num_fd_to_open/2 + 1]);
> +  reexecute (fds[0], &fa);
> +
> +  /* Add another gap, at the beginning.  */
> +  xclose (fds[0]);
> +  xclose (fds[1]);
> +  reexecute (fds[0], &fa);
> +
> +  /* Add another gap, now at the end.  */
> +  xclose (fds[num_fd_to_open-1]);
> +  xclose (fds[num_fd_to_open-2]);
> +  reexecute (fds[0], &fa);
> +
> +  /* Open some more files, filling the gaps.  */
> +  for (int i = 0; i < 6; i++)
> +    xopen ("/dev/null", O_WRONLY, 0);
> +  reexecute (fds[0], &fa);
> +
> +  /* Open some more, but with O_CLOEXEC.  */
> +  for (int i = 0; i < num_fd_to_open/2; i++)
> +    xopen ("/dev/null", O_WRONLY | O_CLOEXEC, 0);
> +
> +  free (fds);
> +}
> +
> +
> +static int
> +do_test (int argc, char *argv[])
> +{
> +  /* We must have one or four parameters left if called initially:
> +       + path for ld.so		optional
> +       + "--library-path"	optional
> +       + the library path	optional
> +       + the application name
> +
> +     Plus one parameter to indicate which test to execute through
> +     re-execution.
> +
> +     So for default usage without --enable-hardcoded-path-in-tests, it
> +     will be called initially with 5 arguments and later with 2.  For
> +     --enable-hardcoded-path-in-tests it will be called with 2 arguments
> +     regardless.  */
> +
> +  if (argc != (restart ? 2 : 5) && argc != 2)
> +    FAIL_EXIT1 ("wrong number of arguments (%d)", argc);
> +
> +  if (restart)
> +    return handle_restart (atoi (argv[1]));
> +
> +  /* Respawn using the same arguments.  */
> +  for (initial_spargv_size = 0;
> +       initial_spargv_size < (argc == 5 ? 4 : 1);
> +       initial_spargv_size++)
> +    initial_spargv[initial_spargv_size] = argv[initial_spargv_size + 1];
> +  initial_spargv[initial_spargv_size++] = (char *) "--direct";
> +  initial_spargv[initial_spargv_size++] = (char *) "--restart";
> +
> +  do_test_closefrom (10);
> +  do_test_closefrom (100);
> +
> +  return 0;
> +}
> +
> +#define TEST_FUNCTION_ARGV do_test
> +#include <support/test-driver.c>
> diff --git a/sysdeps/mach/hurd/i386/libc.abilist b/sysdeps/mach/hurd/i386/libc.abilist
> index 1fc7ab2433..fcf957cfc6 100644
> --- a/sysdeps/mach/hurd/i386/libc.abilist
> +++ b/sysdeps/mach/hurd/i386/libc.abilist
> @@ -2175,6 +2175,7 @@ GLIBC_2.3.4 setipv4sourcefilter F
>  GLIBC_2.3.4 setsourcefilter F
>  GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
>  GLIBC_2.4 __fgets_chk F
> diff --git a/sysdeps/mach/hurd/spawni.c b/sysdeps/mach/hurd/spawni.c
> index e8024a2679..720e56ffb3 100644
> --- a/sysdeps/mach/hurd/spawni.c
> +++ b/sysdeps/mach/hurd/spawni.c
> @@ -597,6 +597,10 @@ __spawni (pid_t *pid, const char *file,
>  	  case spawn_do_fchdir:
>  	    err = child_fchdir (action->action.fchdir_action.fd);
>  	    break;
> +
> +	  case spawn_do_closefrom:
> +	    err = EINVAL;
> +	    break;
>  	  }
>  
>  	if (err)
> diff --git a/sysdeps/posix/spawni.c b/sysdeps/posix/spawni.c
> index a5913feb14..3beaba91db 100644
> --- a/sysdeps/posix/spawni.c
> +++ b/sysdeps/posix/spawni.c
> @@ -231,6 +231,10 @@ __spawni_child (void *arguments)
>  	      if (__fchdir (action->action.fchdir_action.fd) != 0)
>  		goto fail;
>  	      break;
> +
> +	    case spawn_do_closefrom:
> +	      __set_errno (EINVAL);
> +	      goto fail;
>  	    }
>  	}
>      }
> diff --git a/sysdeps/unix/sysv/linux/aarch64/libc.abilist b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
> index a4c31932cb..e1e793b348 100644
> --- a/sysdeps/unix/sysv/linux/aarch64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/aarch64/libc.abilist
> @@ -2143,5 +2143,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/alpha/libc.abilist b/sysdeps/unix/sysv/linux/alpha/libc.abilist
> index fe85a35620..735e54f433 100644
> --- a/sysdeps/unix/sysv/linux/alpha/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/alpha/libc.abilist
> @@ -2218,6 +2218,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/arm/libc.abilist b/sysdeps/unix/sysv/linux/arm/libc.abilist
> index bc3df8dcea..a3b9db6efa 100644
> --- a/sysdeps/unix/sysv/linux/arm/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/arm/libc.abilist
> @@ -128,6 +128,7 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _Exit F
> diff --git a/sysdeps/unix/sysv/linux/csky/libc.abilist b/sysdeps/unix/sysv/linux/csky/libc.abilist
> index 9b3cee65bb..88d112b9b1 100644
> --- a/sysdeps/unix/sysv/linux/csky/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/csky/libc.abilist
> @@ -2087,5 +2087,6 @@ GLIBC_2.29 xprt_register F
>  GLIBC_2.29 xprt_unregister F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/hppa/libc.abilist b/sysdeps/unix/sysv/linux/hppa/libc.abilist
> index 75edece94a..2925c96183 100644
> --- a/sysdeps/unix/sysv/linux/hppa/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/hppa/libc.abilist
> @@ -2039,6 +2039,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/i386/libc.abilist b/sysdeps/unix/sysv/linux/i386/libc.abilist
> index edeaf8e722..9fd8ceb639 100644
> --- a/sysdeps/unix/sysv/linux/i386/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/i386/libc.abilist
> @@ -2205,6 +2205,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/ia64/libc.abilist b/sysdeps/unix/sysv/linux/ia64/libc.abilist
> index b5d460eeb2..37d817eeb3 100644
> --- a/sysdeps/unix/sysv/linux/ia64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/ia64/libc.abilist
> @@ -2071,6 +2071,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
> index 05633b3cb8..e81ab1f0bf 100644
> --- a/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/m68k/coldfire/libc.abilist
> @@ -129,6 +129,7 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _Exit F
> diff --git a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
> index 47eb7b4608..cd5742bf63 100644
> --- a/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/m68k/m680x0/libc.abilist
> @@ -2148,6 +2148,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/microblaze/libc.abilist b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
> index f7ced487f7..957b14d992 100644
> --- a/sysdeps/unix/sysv/linux/microblaze/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/microblaze/libc.abilist
> @@ -2135,5 +2135,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
> index e49dc4272e..b8ffdea448 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/mips/mips32/fpu/libc.abilist
> @@ -2122,6 +2122,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
> index daa3b60c5b..e1c861720a 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/mips/mips32/nofpu/libc.abilist
> @@ -2120,6 +2120,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
> index 457ce0b6f2..88fe3f4d26 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/libc.abilist
> @@ -2128,6 +2128,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
> index 63d5c03bfb..7c6dafa818 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/libc.abilist
> @@ -2122,6 +2122,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/nios2/libc.abilist b/sysdeps/unix/sysv/linux/nios2/libc.abilist
> index 7fec0c9670..487b005070 100644
> --- a/sysdeps/unix/sysv/linux/nios2/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/nios2/libc.abilist
> @@ -2176,5 +2176,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
> index 9200a54309..db81db978a 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/fpu/libc.abilist
> @@ -2178,6 +2178,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
> index 2860df8ebc..06dfdf1fed 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/be/libc.abilist
> @@ -2041,6 +2041,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
> index 2229a1dcc0..eb0532937e 100644
> --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/libc.abilist
> @@ -2245,5 +2245,6 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
> index 31010e6cf7..4985fa93e3 100644
> --- a/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/riscv/rv64/libc.abilist
> @@ -2105,5 +2105,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
> index 576295deff..1ef1b9d4cc 100644
> --- a/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/s390/s390-32/libc.abilist
> @@ -2173,6 +2173,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
> index abf0473683..f9d9fe68ca 100644
> --- a/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/s390/s390-64/libc.abilist
> @@ -2077,6 +2077,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/sh/libc.abilist b/sysdeps/unix/sysv/linux/sh/libc.abilist
> index 41977f6e9c..1b12384dd1 100644
> --- a/sysdeps/unix/sysv/linux/sh/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/sh/libc.abilist
> @@ -2043,6 +2043,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
> index 3d2f00ca52..a7c244cb56 100644
> --- a/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/libc.abilist
> @@ -2167,6 +2167,7 @@ GLIBC_2.30 __nldbl_warn F
>  GLIBC_2.30 __nldbl_warnx F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 _IO_fprintf F
> diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
> index 2f20643e8e..a71facfb43 100644
> --- a/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/libc.abilist
> @@ -2094,6 +2094,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/spawn_int_abi.h b/sysdeps/unix/sysv/linux/spawn_int_abi.h
> new file mode 100644
> index 0000000000..9c4b31ccae
> --- /dev/null
> +++ b/sysdeps/unix/sysv/linux/spawn_int_abi.h
> @@ -0,0 +1,25 @@
> +/* Internal ABI specific for posix_spawn functionality.  Linux version.
> +   Copyright (C) 2019 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <http://www.gnu.org/licenses/>.  */
> +
> +#ifndef _SPAWN_INT_ABI_H
> +#define _SPAWN_INT_ABI_H
> +
> +/* spawni.c implements closefrom by interacting over /proc/self/fd.  */
> +#define __SPAWN_SUPPORT_CLOSEFROM 1
> +
> +#endif /* _SPAWN_INT_H */
> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
> index c1abf3f960..ca7bf99825 100644
> --- a/sysdeps/unix/sysv/linux/spawni.c
> +++ b/sysdeps/unix/sysv/linux/spawni.c
> @@ -17,20 +17,16 @@
>     <http://www.gnu.org/licenses/>.  */
>  
>  #include <spawn.h>
> -#include <fcntl.h>
>  #include <paths.h>
> -#include <string.h>
> +#include <dirent.h>
>  #include <sys/resource.h>
> -#include <sys/wait.h>
> -#include <sys/param.h>
> -#include <sys/mman.h>
>  #include <not-cancel.h>
>  #include <local-setxid.h>
>  #include <shlib-compat.h>
> -#include <nptl/pthreadP.h>
> -#include <dl-sysdep.h>
> -#include <libc-pointer-arith.h>
> +#include <sigsetops.h>
> +#include <internal-signals.h>
>  #include <ldsodefs.h>
> +#include <ctype.h>
>  #include "spawn_int.h"
>  
>  /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
> @@ -114,6 +110,44 @@ maybe_script_execute (struct posix_spawn_args *args)
>      }
>  }
>  
> +/* Close all file descriptor up to FROM by interacting /proc/self/fd.  */
> +static bool
> +spawn_closefrom (int from)
> +{
> +  struct dirent64 entries[1024 / sizeof (struct dirent64)];
> +
> +  int dirfd = __open ("/proc/self/fd", O_RDONLY | O_DIRECTORY, 0);
> +  if (dirfd == -1)
> +    return false;
> +
> +  ssize_t r;
> +  while ((r = __getdents64 (dirfd, entries, sizeof (entries))) > 0)
> +    {
> +      struct dirent64 *dp = entries;
> +      struct dirent64 *edp = (void *)((uintptr_t) dp + r);
> +
> +      for (struct dirent64 *dp = entries; dp < edp;
> +	   dp = (void *)((uintptr_t) dp + dp->d_reclen))
> +	{
> +	  int fd = 0;
> +
> +	  if (dp->d_name[0] == '.')
> +	    continue;
> +
> +	  for (const char *s = dp->d_name; isdigit (*s); s++)
> +	    fd = 10 * fd + (*s - '0');
> +
> +	  if (fd == dirfd || fd < from)
> +	    continue;
> +
> +	  __close_nocancel (fd);
> +	}
> +    }
> +
> +  __close_nocancel (dirfd);
> +  return true;
> +}
> +
>  /* Function used in the clone call to setup the signals mask, posix_spawn
>     attributes, and file actions.  It run on its own stack (provided by the
>     posix_spawn call).  */
> @@ -280,6 +314,11 @@ __spawni_child (void *arguments)
>  	      if (__fchdir (action->action.fchdir_action.fd) != 0)
>  		goto fail;
>  	      break;
> +
> +	    case spawn_do_closefrom:
> +	      if (!spawn_closefrom (action->action.closefrom_action.from))
> +		goto fail;
> +	      break;
>  	    }
>  	}
>      }
> @@ -339,12 +378,13 @@ __spawnix (pid_t * pid, const char *file,
>    int prot = (PROT_READ | PROT_WRITE
>  	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>  
> -  /* Add a slack area for child's stack.  */
> -  size_t argv_size = (argc * sizeof (void *)) + 512;
> +  size_t argv_size = (argc * sizeof (void *));
>    /* We need at least a few pages in case the compiler's stack checking is
>       enabled.  In some configs, it is known to use at least 24KiB.  We use
>       32KiB to be "safe" from anything the compiler might do.  Besides, the
> -     extra pages won't actually be allocated unless they get used.  */
> +     extra pages won't actually be allocated unless they get used.
> +     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
> +     where it might use about 1k extra stack space.  */
>    argv_size += (32 * 1024);
>    size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
>    void *stack = __mmap (NULL, stack_size, prot,
> diff --git a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
> index 59f85d9373..78a43f5851 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/64/libc.abilist
> @@ -2052,6 +2052,7 @@ GLIBC_2.3.4 xdr_quad_t F
>  GLIBC_2.3.4 xdr_u_quad_t F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
>  GLIBC_2.4 __confstr_chk F
> diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
> index 67a4e238d6..b83897ddbf 100644
> --- a/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
> +++ b/sysdeps/unix/sysv/linux/x86_64/x32/libc.abilist
> @@ -2151,5 +2151,6 @@ GLIBC_2.29 posix_spawn_file_actions_addchdir_np F
>  GLIBC_2.29 posix_spawn_file_actions_addfchdir_np F
>  GLIBC_2.30 getdents64 F
>  GLIBC_2.30 gettid F
> +GLIBC_2.30 posix_spawn_file_actions_addclosefrom_np F
>  GLIBC_2.30 tgkill F
>  GLIBC_2.30 twalk_r F
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (3 preceding siblings ...)
  2019-07-31 18:31 ` [PATCH v2 5/5] posix: Use posix_spawn for wordexp Adhemerval Zanella
@ 2019-08-28 14:09 ` Adhemerval Zanella
  2019-08-28 14:35 ` Andreas Schwab
                   ` (2 subsequent siblings)
  7 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 14:09 UTC (permalink / raw)
  To: libc-alpha

Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> This patch changes how the fallback getdents64 implementation calls
> non-LFS getdents by replacing the scratch_buffer with static buffer
> plus a loop on getdents calls.  This avoids the potential malloc
> call on scratch_buffer_set_array_size for large input buffer size
> at the cost of more getdents syscalls.
> 
> It also adds a small optimization for older kernels, where the first
> ENOSYS failure for getdents64 disable subsequent calls.
> 
> Check the dirent tests on a mips64-linux-gnu with getdents64 code
> disabled.
> 
> 	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
> 	Add small optimization for older kernel to avoid issuing
> 	__NR_getdents64 on each call and replace scratch_buffer usage with
> 	a static allocated buffer.
> ---
>  .../unix/sysv/linux/mips/mips64/getdents64.c  | 122 ++++++++----------
>  1 file changed, 54 insertions(+), 68 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> index 8bf3abb0e0..3b5afd9324 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> +++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> @@ -22,98 +22,84 @@
>  #include <assert.h>
>  #include <sys/param.h>
>  #include <unistd.h>
> -#include <scratch_buffer.h>
>  #include <limits.h>
>  
>  ssize_t
> -__getdents64 (int fd, void *buf0, size_t nbytes)
> +__getdents64 (int fd, void *buf, size_t nbytes)
>  {
> -  char *buf = buf0;
> -
>    /* The system call takes an unsigned int argument, and some length
>       checks in the kernel use an int type.  */
>    if (nbytes > INT_MAX)
>      nbytes = INT_MAX;
>  
>  #ifdef __NR_getdents64
> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> -  if (ret != -1)
> -    return ret;
> +  static bool getdents64_supportted = true;
> +  if (atomic_load_relaxed (&getdents64_supportted))
> +    {
> +      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> +      if (ret >= 0 || errno != ENOSYS)
> +	return ret;
> +
> +      atomic_store_relaxed (&getdents64_supportted, false);
> +    }
>  #endif
>  
>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
> -     If syscall is not available it need to fallback to non-LFS one.  */
> +     If the syscall is not available it need to fallback to the non-LFS one.
> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
> +     would make the syscall non async-signal-safe) it uses a limited buffer.
> +     This is sub-optimal for large NBYTES, however this is a fallback
> +     mechanism to emulate a syscall that kernel should provide.   */
>  
> +  enum { KBUF_SIZE = 1024 };
>    struct kernel_dirent
> -    {
> -      unsigned long d_ino;
> -      unsigned long d_off;
> -      unsigned short int d_reclen;
> -      char d_name[256];
> -    };
> -
> -  const size_t size_diff = (offsetof (struct dirent64, d_name)
> -			   - offsetof (struct kernel_dirent, d_name));
> -
> -  size_t red_nbytes = MIN (nbytes
> -			   - ((nbytes / (offsetof (struct dirent64, d_name)
> -					 + 14)) * size_diff),
> -			   nbytes - size_diff);
> -
> -  struct scratch_buffer tmpbuf;
> -  scratch_buffer_init (&tmpbuf);
> -  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
> -    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
> -
> -  struct kernel_dirent *skdp, *kdp;
> -  skdp = kdp = tmpbuf.data;
> -
> -  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
> -  if (retval == -1)
> -    {
> -      scratch_buffer_free (&tmpbuf);
> -      return -1;
> -    }
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>  
> -  off64_t last_offset = -1;
>    struct dirent64 *dp = (struct dirent64 *) buf;
> -  while ((char *) kdp < (char *) skdp + retval)
> +
> +  size_t nb = 0;
> +  off64_t last_offset = -1;
> +
> +  ssize_t r;
> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>      {
> -      const size_t alignment = _Alignof (struct dirent64);
> -      /* Since kdp->d_reclen is already aligned for the kernel structure
> -	 this may compute a value that is bigger than necessary.  */
> -      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
> -			   & ~(alignment - 1));
> -      if ((char *) dp + new_reclen > buf + nbytes)
> -        {
> -	  /* Our heuristic failed.  We read too many entries.  Reset
> -	     the stream.  */
> -	  assert (last_offset != -1);
> -	  __lseek64 (fd, last_offset, SEEK_SET);
> -
> -	  if ((char *) dp == buf)
> +      struct kernel_dirent *skdp, *kdp;
> +      skdp = kdp = kbuf;
> +
> +      while ((char *) kdp < (char *) skdp + r)
> +	{
> +	  const size_t alignment = _Alignof (struct dirent64);
> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
> +			      & ~(alignment - 1));
> +	  if (nb + new_reclen > nbytes)
>  	    {
> -	      scratch_buffer_free (&tmpbuf);
> -	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
> +		/* The new entry will overflow the input buffer, rewind to
> +		   last obtained entry and return.  */
> +	       __lseek64 (fd, last_offset, SEEK_SET);
> +	       goto out;
>  	    }
> +	  nb += new_reclen;
>  
> -	  break;
> -	}
> -
> -      last_offset = kdp->d_off;
> -      dp->d_ino = kdp->d_ino;
> -      dp->d_off = kdp->d_off;
> -      dp->d_reclen = new_reclen;
> -      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> -      memcpy (dp->d_name, kdp->d_name,
> -	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
> +	  dp->d_ino = kdp->d_ino;
> +	  dp->d_off = last_offset = kdp->d_off;
> +	  dp->d_reclen = new_reclen;
> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> +	  memcpy (dp->d_name, kdp->d_name,
> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
>  
> -      dp = (struct dirent64 *) ((char *) dp + new_reclen);
> -      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
> +	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
> +	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
> +	}
>      }
>  
> -  scratch_buffer_free (&tmpbuf);
> -  return (char *) dp - buf;
> +out:
> +  return (char *) dp - (char *) buf;
>  }
>  libc_hidden_def (__getdents64)
>  weak_alias (__getdents64, getdents64)
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-07-31 18:31 ` [PATCH v2 5/5] posix: Use posix_spawn for wordexp Adhemerval Zanella
@ 2019-08-28 14:10   ` Adhemerval Zanella
  2019-10-07 17:51     ` Adhemerval Zanella
  2019-10-07 19:33   ` Florian Weimer
  1 sibling, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 14:10 UTC (permalink / raw)
  To: libc-alpha

Ping.

On 31/07/2019 15:31, Adhemerval Zanella wrote:
> Change from previous version:
> 
>   - Use libsupport and remove atfork usage on posix/wordexp-test.c.
> 
> --
> 
> This patch replaces the fork+exec by posix_spawn on wordexp, which
> allows a better scability on Linux and simplifies the thread
> cancellation handling.
> 
> The only change which can not be implemented with posix_spawn the
> /dev/null check to certify it is indeed the expected device.  I am
> not sure how effetive this check is since /dev/null tampering means
> something very wrong with the system and this is the least of the
> issues.  My view is the tests is really out of the place and the
> hardening provided is minimum.
> 
> If the idea is still to provide such check, I think a possibilty
> would be to open /dev/null, check it, add a dup2 file action, and
> close the file descriptor.
> 
> Checked on powerpc64le-linux-gnu and x86_64-linux-gnu.
> 
> 	* include/spawn.h (__posix_spawn_file_actions_addopen): New
> 	prototype.
> 	* posix/spawn_faction_addopen.c (posix_spawn_file_actions_addopen):
> 	Add internal alias.
> 	* posix/wordexp.c (create_environment, free_environment): New
> 	functions.
> 	(exec_comm_child, exec_comm): Use posix_spawn instead of fork+exec.
> 	* posix/wordexp-test.c: Use libsupport and remove atfork usage.
> ---
>  include/spawn.h               |   3 +
>  posix/spawn_faction_addopen.c |   8 +-
>  posix/wordexp-test.c          | 142 +++++++++--------------------
>  posix/wordexp.c               | 167 ++++++++++++++++------------------
>  4 files changed, 129 insertions(+), 191 deletions(-)
> 
> diff --git a/include/spawn.h b/include/spawn.h
> index 7fdd965bd7..4a0b1849da 100644
> --- a/include/spawn.h
> +++ b/include/spawn.h
> @@ -11,6 +11,9 @@ __typeof (posix_spawn_file_actions_addclose)
>  __typeof (posix_spawn_file_actions_adddup2)
>    __posix_spawn_file_actions_adddup2 attribute_hidden;
>  
> +__typeof (posix_spawn_file_actions_addopen)
> +  __posix_spawn_file_actions_addopen attribute_hidden;
> +
>  __typeof (posix_spawn_file_actions_destroy)
>    __posix_spawn_file_actions_destroy attribute_hidden;
>  
> diff --git a/posix/spawn_faction_addopen.c b/posix/spawn_faction_addopen.c
> index 742eb9526d..2e598de300 100644
> --- a/posix/spawn_faction_addopen.c
> +++ b/posix/spawn_faction_addopen.c
> @@ -25,9 +25,9 @@
>  /* Add an action to FILE-ACTIONS which tells the implementation to call
>     `open' for the given file during the `spawn' call.  */
>  int
> -posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
> -				  int fd, const char *path, int oflag,
> -				  mode_t mode)
> +__posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
> +				    int fd, const char *path, int oflag,
> +				    mode_t mode)
>  {
>    struct __spawn_action *rec;
>  
> @@ -60,3 +60,5 @@ posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
>  
>    return 0;
>  }
> +weak_alias (__posix_spawn_file_actions_addopen,
> +	    posix_spawn_file_actions_addopen)
> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
> index 10a0768a6b..ef780b0a65 100644
> --- a/posix/wordexp-test.c
> +++ b/posix/wordexp-test.c
> @@ -15,39 +15,21 @@
>     License along with the GNU C Library; if not, see
>     <http://www.gnu.org/licenses/>.  */
>  
> -#include <sys/stat.h>
> -#include <sys/types.h>
> -#include <sys/mman.h>
> +#include <wordexp.h>
> +#include <stdio.h>
>  #include <fcntl.h>
> -#include <unistd.h>
>  #include <pwd.h>
> -#include <stdio.h>
> -#include <stdint.h>
>  #include <stdlib.h>
>  #include <string.h>
> -#include <wordexp.h>
> +#include <sys/mman.h>
> +
>  #include <libc-pointer-arith.h>
> -#include <dso_handle.h>
> +#include <array_length.h>
> +#include <support/xunistd.h>
> +#include <support/check.h>
>  
>  #define IFS " \n\t"
>  
> -extern int __register_atfork (void (*) (void), void (*) (void), void (*) (void), void *);
> -
> -static int __app_register_atfork (void (*prepare) (void), void (*parent) (void), void (*child) (void))
> -{
> -  return __register_atfork (prepare, parent, child, __dso_handle);
> -}
> -
> -/* Number of forks seen.  */
> -static int registered_forks;
> -
> -/* For each fork increment the fork count.  */
> -static void
> -register_fork (void)
> -{
> -  registered_forks++;
> -}
> -
>  struct test_case_struct
>  {
>    int retval;
> @@ -57,7 +39,7 @@ struct test_case_struct
>    size_t wordc;
>    const char *wordv[10];
>    const char *ifs;
> -} test_case[] =
> +} static test_case[] =
>    {
>      /* Simple word- and field-splitting */
>      { 0, NULL, "one", 0, 1, { "one", }, IFS },
> @@ -238,8 +220,6 @@ struct test_case_struct
>      { WRDE_SYNTAX, NULL, "${", 0, 0, { NULL, }, IFS },      /* BZ 18043  */
>      { WRDE_SYNTAX, NULL, "L${a:", 0, 0, { NULL, }, IFS },   /* BZ 18043#c4  */
>      { WRDE_SYNTAX, NULL, "$[1/0]", WRDE_NOCMD, 0, {NULL, }, IFS }, /* BZ 18100 */
> -
> -    { -1, NULL, NULL, 0, 0, { NULL, }, IFS },
>    };
>  
>  static int testit (struct test_case_struct *tc);
> @@ -256,16 +236,14 @@ command_line_test (const char *words)
>      printf ("we_wordv[%d] = \"%s\"\n", i, we.we_wordv[i]);
>  }
>  
> -int
> -main (int argc, char *argv[])
> +static int
> +do_test (int argc, char *argv[])
>  {
> -  const char *globfile[] = { "one", "two", "three", NULL };
> +  const char *globfile[] = { "one", "two", "three" };
>    char tmpdir[32];
>    struct passwd *pw;
>    const char *cwd;
>    int test;
> -  int fail = 0;
> -  int i;
>    struct test_case_struct ts;
>  
>    if (argc > 1)
> @@ -278,30 +256,18 @@ main (int argc, char *argv[])
>  
>    /* Set up arena for pathname expansion */
>    tmpnam (tmpdir);
> -  if (mkdir (tmpdir, S_IRWXU) || chdir (tmpdir))
> -    return -1;
> -  else
> -    {
> -      int fd;
> +  xmkdir (tmpdir, S_IRWXU);
> +  TEST_VERIFY_EXIT (chdir (tmpdir) == 0);
>  
> -      for (i = 0; globfile[i]; ++i)
> -	if ((fd = creat (globfile[i], S_IRUSR | S_IWUSR)) == -1
> -	    || close (fd))
> -	  return -1;
> -    }
> -
> -  /* If we are not allowed to do command substitution, we install
> -     fork handlers to verify that no forks happened.  No forks should
> -     happen at all if command substitution is disabled.  */
> -  if (__app_register_atfork (register_fork, NULL, NULL) != 0)
> +  for (int i = 0; i < array_length (globfile); ++i)
>      {
> -      printf ("Failed to register fork handler.\n");
> -      return -1;
> +      int fd = xopen (globfile[i], O_WRONLY|O_CREAT|O_TRUNC,
> +		      S_IRUSR | S_IWUSR);
> +      xclose (fd);
>      }
>  
> -  for (test = 0; test_case[test].retval != -1; test++)
> -    if (testit (&test_case[test]))
> -      ++fail;
> +  for (test = 0; test < array_length (test_case); test++)
> +    TEST_COMPARE (testit (&test_case[test]), 0);
>  
>    /* Tilde-expansion tests. */
>    pw = getpwnam ("root");
> @@ -315,8 +281,7 @@ main (int argc, char *argv[])
>        ts.wordv[0] = pw->pw_dir;
>        ts.ifs = IFS;
>  
> -      if (testit (&ts))
> -	++fail;
> +      TEST_COMPARE (testit (&ts), 0);
>  
>        ts.retval = 0;
>        ts.env = pw->pw_dir;
> @@ -326,8 +291,7 @@ main (int argc, char *argv[])
>        ts.wordv[0] = "x";
>        ts.ifs = IFS;
>  
> -      if (testit (&ts))
> -	++fail;
> +      TEST_COMPARE (testit (&ts), 0);
>      }
>  
>    /* "~" expands to value of $HOME when HOME is set */
> @@ -342,8 +306,7 @@ main (int argc, char *argv[])
>    ts.wordv[1] = "/dummy/home/foo";
>    ts.ifs = IFS;
>  
> -  if (testit (&ts))
> -    ++fail;
> +  TEST_COMPARE (testit (&ts), 0);
>  
>    /* "~" expands to home dir from passwd file if HOME is not set */
>  
> @@ -359,8 +322,7 @@ main (int argc, char *argv[])
>        ts.wordv[0] = pw->pw_dir;
>        ts.ifs = IFS;
>  
> -      if (testit (&ts))
> -	++fail;
> +      TEST_COMPARE (testit (&ts), 0);
>      }
>  
>    /* Integer overflow in division.  */
> @@ -375,37 +337,32 @@ main (int argc, char *argv[])
>        "18446744073709551616",
>        "170141183460469231731687303715884105728",
>        "340282366920938463463374607431768211456",
> -      NULL
>      };
>  
> -    for (const char *const *num = numbers; *num; ++num)
> +    for (int i = 0; i < array_length (numbers); i++)
>        {
>  	wordexp_t p;
>  	char pattern[256];
> -	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", *num);
> +	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", numbers[i]);
>  	int ret = wordexp (pattern, &p, WRDE_NOCMD);
>  	if (ret == 0)
>  	  {
> -	    if (p.we_wordc != 1 || strcmp (p.we_wordv[0], *num) != 0)
> -	      {
> -		printf ("Integer overflow for \"%s\" failed", pattern);
> -		++fail;
> -	      }
> +	    TEST_COMPARE (p.we_wordc, 1);
> +	    TEST_COMPARE (strcmp (p.we_wordv[0], numbers[i]), 0);
>  	    wordfree (&p);
>  	  }
> -	else if (ret != WRDE_SYNTAX)
> +	else
>  	  {
> -	    printf ("Integer overflow for \"%s\" failed with %d",
> -		    pattern, ret);
> -	    ++fail;
> +	    TEST_COMPARE (ret, WRDE_SYNTAX);
> +	    if (ret != WRDE_SYNTAX)
> +	      printf ("Integer overflow for \"%s\" failed with %d",
> +		      pattern, ret);
>  	  }
>        }
>    }
>  
> -  puts ("tests completed, now cleaning up");
> -
>    /* Clean up */
> -  for (i = 0; globfile[i]; ++i)
> +  for (int i = 0; i < array_length (globfile); ++i)
>      remove (globfile[i]);
>  
>    if (cwd == NULL)
> @@ -414,26 +371,17 @@ main (int argc, char *argv[])
>    chdir (cwd);
>    rmdir (tmpdir);
>  
> -  printf ("tests failed: %d\n", fail);
> -
> -  return fail != 0;
> +  return 0;
>  }
>  
>  static const char *
>  at_page_end (const char *words)
>  {
>    const int pagesize = getpagesize ();
> -  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
> -		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> +  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
> +		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
>  
> -  if (start == MAP_FAILED)
> -    return start;
> -
> -  if (mprotect (start + pagesize, pagesize, PROT_NONE))
> -    {
> -      munmap (start, 2 * pagesize);
> -      return MAP_FAILED;
> -    }
> +  xmprotect (start + pagesize, pagesize, PROT_NONE);
>  
>    /* Includes terminating NUL.  */
>    const size_t words_size = strlen (words) + 1;
> @@ -472,9 +420,6 @@ testit (struct test_case_struct *tc)
>    fflush (NULL);
>    const char *words = at_page_end (tc->words);
>  
> -  if (tc->flags & WRDE_NOCMD)
> -    registered_forks = 0;
> -
>    if (tc->flags & WRDE_APPEND)
>      {
>        /* initial wordexp() call, to be appended to */
> @@ -486,13 +431,6 @@ testit (struct test_case_struct *tc)
>      }
>    retval = wordexp (words, &we, tc->flags);
>  
> -  if ((tc->flags & WRDE_NOCMD)
> -      && (registered_forks > 0))
> -    {
> -	  printf ("FAILED fork called for WRDE_NOCMD\n");
> -	  return 1;
> -    }
> -
>    if (tc->flags & WRDE_DOOFFS)
>        start_offs = sav_we.we_offs;
>  
> @@ -551,9 +489,11 @@ testit (struct test_case_struct *tc)
>    const int page_size = getpagesize ();
>    char *start = (char *) PTR_ALIGN_DOWN (words, page_size);
>  
> -  if (munmap (start, 2 * page_size) != 0)
> -    return 1;
> +  xmunmap (start, 2 * page_size);
>  
>    fflush (NULL);
>    return bzzzt;
>  }
> +
> +#define TEST_FUNCTION_ARGV do_test
> +#include <support/test-driver.c>
> diff --git a/posix/wordexp.c b/posix/wordexp.c
> index 22c6d18a9c..e1aafcaceb 100644
> --- a/posix/wordexp.c
> +++ b/posix/wordexp.c
> @@ -25,33 +25,18 @@
>  #include <libintl.h>
>  #include <paths.h>
>  #include <pwd.h>
> -#include <signal.h>
>  #include <stdbool.h>
>  #include <stdio.h>
> -#include <stdlib.h>
>  #include <string.h>
>  #include <sys/param.h>
> -#include <sys/stat.h>
> -#include <sys/time.h>
> -#include <sys/types.h>
> -#include <sys/types.h>
>  #include <sys/wait.h>
>  #include <unistd.h>
> -#include <wchar.h>
>  #include <wordexp.h>
> -#include <kernel-features.h>
> +#include <spawn.h>
>  #include <scratch_buffer.h>
> -
> -#include <libc-lock.h>
>  #include <_itoa.h>
> -
> -/* Undefine the following line for the production version.  */
> -/* #define NDEBUG 1 */
>  #include <assert.h>
>  
> -/* Get some device information.  */
> -#include <device-nrs.h>
> -
>  /*
>   * This is a recursive-descent-style word expansion routine.
>   */
> @@ -812,61 +797,90 @@ parse_arith (char **word, size_t *word_length, size_t *max_length,
>    return WRDE_SYNTAX;
>  }
>  
> +static char **
> +create_environment (void)
> +{
> +  size_t s = 0;
> +
> +  /* Calculate total environment size, including 'IFS' if is present.  */
> +  for (char **ep = __environ; *ep != NULL; ep++, s++);
> +
> +  /* Include final NULL pointer.  */
> +  char **newenviron = malloc (s * sizeof (char*));
> +  if (newenviron == NULL)
> +    return NULL;
> +
> +  /* Copy current environment excluding 'IFS', to make sure the subshell
> +     doesn't field-split on our behalf. */
> +  size_t i, j;
> +  for (i = 0, j = 0; i < s; i++)
> +    if (strncmp (__environ[i], "IFS=", sizeof ("IFS=")-1) != 0)
> +      newenviron[j++] = __strdup (__environ[i]);
> +  newenviron[j] = NULL;
> +
> +  return newenviron;
> +}
> +
> +static void
> +free_environment (char **environ)
> +{
> +  for (char **ep = environ; *ep != NULL; ep++)
> +    free (*ep);
> +  free (environ);
> +}
> +
>  /* Function called by child process in exec_comm() */
> -static inline void
> -__attribute__ ((always_inline))
> -exec_comm_child (char *comm, int *fildes, int showerr, int noexec)
> +static pid_t
> +exec_comm_child (char *comm, int *fildes, bool showerr, bool noexec)
>  {
> -  const char *args[4] = { _PATH_BSHELL, "-c", comm, NULL };
> +  pid_t pid = -1;
>  
> -  /* Execute the command, or just check syntax? */
> -  if (noexec)
> -    args[1] = "-nc";
> +  /* Execute the command, or just check syntax?  */
> +  const char *args[] = { _PATH_BSHELL, noexec ? "-nc" : "-c", comm, NULL };
>  
> -  /* Redirect output.  */
> -  if (__glibc_likely (fildes[1] != STDOUT_FILENO))
> -    {
> -      __dup2 (fildes[1], STDOUT_FILENO);
> -      __close (fildes[1]);
> -    }
> -  else
> -    /* Reset the close-on-exec flag (if necessary).  */
> -    __fcntl (fildes[1], F_SETFD, 0);
> +  posix_spawn_file_actions_t fa;
> +  /* posix_spawn_file_actions_init does not fail.  */
> +  __posix_spawn_file_actions_init (&fa);
>  
> -  /* Redirect stderr to /dev/null if we have to.  */
> -  if (showerr == 0)
> +  /* Redirect output.  For check syntax only (noexec being true), exec_comm
> +     explicits sets fildes[1] to -1, so check its value to avoid a failure in
> +     __posix_spawn_file_actions_adddup2.  */
> +  if (fildes[1] != -1)
>      {
> -      struct stat64 st;
> -      int fd;
> -      __close (STDERR_FILENO);
> -      fd = __open (_PATH_DEVNULL, O_WRONLY);
> -      if (fd >= 0 && fd != STDERR_FILENO)
> +      if (__glibc_likely (fildes[1] != STDOUT_FILENO))
>  	{
> -	  __dup2 (fd, STDERR_FILENO);
> -	  __close (fd);
> +	  if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1],
> +						  STDOUT_FILENO) != 0
> +	      || __posix_spawn_file_actions_addclose (&fa, fildes[1]) != 0)
> +	    goto out;
>  	}
> -      /* Be paranoid.  Check that we actually opened the /dev/null
> -	 device.  */
> -      if (__builtin_expect (__fxstat64 (_STAT_VER, STDERR_FILENO, &st), 0) != 0
> -	  || __builtin_expect (S_ISCHR (st.st_mode), 1) == 0
> -#if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
> -	  || st.st_rdev != __gnu_dev_makedev (DEV_NULL_MAJOR, DEV_NULL_MINOR)
> -#endif
> -	  )
> -	/* It's not the /dev/null device.  Stop right here.  The
> -	   problem is: how do we stop?  We use _exit() with an
> -	   hopefully unusual exit code.  */
> -	_exit (90);
> +      else
> +	/* Reset the close-on-exec flag (if necessary).  */
> +	if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1], fildes[1])
> +	    != 0)
> +	  goto out;
>      }
>  
> -  /* Make sure the subshell doesn't field-split on our behalf. */
> -  __unsetenv ("IFS");
> +  /* Redirect stderr to /dev/null if we have to.  */
> +  if (!showerr)
> +    if (__posix_spawn_file_actions_addopen (&fa, STDERR_FILENO, _PATH_DEVNULL,
> +					    O_WRONLY, 0) != 0)
> +      goto out;
> +
> +  char **newenv = create_environment ();
> +  if (newenv == NULL)
> +    goto out;
>  
> -  __close (fildes[0]);
> -  __execve (_PATH_BSHELL, (char *const *) args, __environ);
> +  /* pid is unset if posix_spawn fails, so it keep the original value
> +     of -1.  */
> +  __posix_spawn (&pid, _PATH_BSHELL, &fa, NULL, (char *const *) args, newenv);
>  
> -  /* Bad.  What now?  */
> -  abort ();
> +  free_environment (newenv);
> +
> +out:
> +  __posix_spawn_file_actions_destroy (&fa);
> +
> +  return pid;
>  }
>  
>  /* Function to execute a command and retrieve the results */
> @@ -884,13 +898,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>    size_t maxnewlines = 0;
>    char buffer[bufsize];
>    pid_t pid;
> -  int noexec = 0;
> +  bool noexec = false;
>  
>    /* Do nothing if command substitution should not succeed.  */
>    if (flags & WRDE_NOCMD)
>      return WRDE_CMDSUB;
>  
> -  /* Don't fork() unless necessary */
> +  /* Don't posix_spawn() unless necessary */
>    if (!comm || !*comm)
>      return 0;
>  
> @@ -898,19 +912,15 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>      return WRDE_NOSPACE;
>  
>   again:
> -  if ((pid = __fork ()) < 0)
> +  pid = exec_comm_child (comm, fildes, noexec ? false : flags & WRDE_SHOWERR,
> +			 noexec);
> +  if (pid < 0)
>      {
> -      /* Bad */
>        __close (fildes[0]);
>        __close (fildes[1]);
>        return WRDE_NOSPACE;
>      }
>  
> -  if (pid == 0)
> -    exec_comm_child (comm, fildes, noexec ? 0 : flags & WRDE_SHOWERR, noexec);
> -
> -  /* Parent */
> -
>    /* If we are just testing the syntax, only wait.  */
>    if (noexec)
>      return (TEMP_FAILURE_RETRY (__waitpid (pid, &status, 0)) == pid
> @@ -1091,7 +1101,7 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>    /* Check for syntax error (re-execute but with "-n" flag) */
>    if (buflen < 1 && status != 0)
>      {
> -      noexec = 1;
> +      noexec = true;
>        goto again;
>      }
>  
> @@ -1143,26 +1153,9 @@ parse_comm (char **word, size_t *word_length, size_t *max_length,
>  	      /* Go -- give script to the shell */
>  	      if (comm)
>  		{
> -#ifdef __libc_ptf_call
> -		  /* We do not want the exec_comm call to be cut short
> -		     by a thread cancellation since cleanup is very
> -		     ugly.  Therefore disable cancellation for
> -		     now.  */
> -		  // XXX Ideally we do want the thread being cancelable.
> -		  // XXX If demand is there we'll change it.
> -		  int state = PTHREAD_CANCEL_ENABLE;
> -		  __libc_ptf_call (__pthread_setcancelstate,
> -				   (PTHREAD_CANCEL_DISABLE, &state), 0);
> -#endif
> -
> +		  /* posix_spawn already handles thread cancellation.  */
>  		  error = exec_comm (comm, word, word_length, max_length,
>  				     flags, pwordexp, ifs, ifs_white);
> -
> -#ifdef __libc_ptf_call
> -		  __libc_ptf_call (__pthread_setcancelstate,
> -				   (state, NULL), 0);
> -#endif
> -
>  		  free (comm);
>  		}
>  
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (4 preceding siblings ...)
  2019-08-28 14:09 ` [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
@ 2019-08-28 14:35 ` Andreas Schwab
  2019-08-28 17:01   ` Adhemerval Zanella
  2019-08-28 14:42 ` Florian Weimer
  2019-08-30  9:53 ` Florian Weimer
  7 siblings, 1 reply; 59+ messages in thread
From: Andreas Schwab @ 2019-08-28 14:35 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:

>  #ifdef __NR_getdents64
> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> -  if (ret != -1)
> -    return ret;
> +  static bool getdents64_supportted = true;

s/supportted/supported/

Andreas.

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (5 preceding siblings ...)
  2019-08-28 14:35 ` Andreas Schwab
@ 2019-08-28 14:42 ` Florian Weimer
  2019-08-28 21:02   ` Adhemerval Zanella
  2019-08-30  9:53 ` Florian Weimer
  7 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-08-28 14:42 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

>    struct kernel_dirent
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];

I think it's still not clear to me in which cases we actually need to
move the dirent entries in the buffer.  My impression is that we just
need to move d_name by one byte because before, d_type was after the
name, and afterwards, it comes before the name.  But the record
boundaries are unchanged.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-28 14:35 ` Andreas Schwab
@ 2019-08-28 17:01   ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 17:01 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: libc-alpha



On 28/08/2019 11:34, Andreas Schwab wrote:
> On Jul 31 2019, Adhemerval Zanella <adhemerval.zanella@linaro.org> wrote:
> 
>>  #ifdef __NR_getdents64
>> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
>> -  if (ret != -1)
>> -    return ret;
>> +  static bool getdents64_supportted = true;
> 
> s/supportted/supported/
> 
> Andreas.
> 

Fixed, thanks.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom
  2019-08-28 14:09   ` Adhemerval Zanella
@ 2019-08-28 17:22     ` Joseph Myers
  2019-08-28 21:03       ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Joseph Myers @ 2019-08-28 17:22 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

This needs symbol version updates for 2.31.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-28 14:42 ` Florian Weimer
@ 2019-08-28 21:02   ` Adhemerval Zanella
  2019-08-28 21:23     ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 21:02 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 28/08/2019 11:42, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>    struct kernel_dirent
>> +  {
>> +    unsigned long d_ino;
>> +    unsigned long d_off;
>> +    unsigned short int d_reclen;
>> +    char d_name[1];
>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
> 
> I think it's still not clear to me in which cases we actually need to
> move the dirent entries in the buffer.  My impression is that we just
> need to move d_name by one byte because before, d_type was after the
> name, and afterwards, it comes before the name.  But the record
> boundaries are unchanged.

My understanding is the record boundary would be same as long the 
d_name fits on the alignment padding space minus the size of the
d_type.  Otherwise the dirent64 will need to be extended.

This leads to possible memmoves the whole buffer obtained from kernel
on which iteration (which for large buffer is another performance
drain) and possible lseek for the case where registers are slide out
the buffer.  I strongly think using an auxiliary buffer is still
simpler than operating in-place.

Also, mips64-n32 has another issue I found recently: it uses the
compat syscall which uses the compat dirent with both d_ino/d_off
as 32 bits.  For in-place getdents call it will require even more
memory moves. Below is an updated patch to fix it:

---

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..881b5eb651 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,98 +22,92 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static bool getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
+  enum { KBUF_SIZE = 1024 };
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
-
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
-
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
-
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+                           - offsetof (struct kernel_dirent, d_name));
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
+  size_t nb = 0;
   off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+
+  ssize_t r;
+  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      struct kernel_dirent *skdp, *kdp;
+      skdp = kdp = kbuf;
+
+      while ((char *) kdp < (char *) skdp + r)
+	{
+	  const size_t alignment = _Alignof (struct dirent64);
+	  size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
+			      & ~(alignment - 1));
+	  if (nb + new_reclen > nbytes)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+		/* The new entry will overflow the input buffer, rewind to
+		   last obtained entry and return.  */
+	       __lseek64 (fd, last_offset, SEEK_SET);
+	       goto out;
 	    }
+	  nb += new_reclen;
 
-	  break;
-	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
-      dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
-      memcpy (dp->d_name, kdp->d_name,
-	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
+	  dp->d_ino = kdp->d_ino;
+	  dp->d_off = last_offset = kdp->d_off;
+	  dp->d_reclen = new_reclen;
+	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
+	  memcpy (dp->d_name, kdp->d_name,
+		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
 
-      dp = (struct dirent64 *) ((char *) dp + new_reclen);
-      kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	  dp = (struct dirent64 *) ((char *) dp + new_reclen);
+	  kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
+	}
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+out:
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom
  2019-08-28 17:22     ` Joseph Myers
@ 2019-08-28 21:03       ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-28 21:03 UTC (permalink / raw)
  To: Joseph Myers; +Cc: libc-alpha



On 28/08/2019 14:21, Joseph Myers wrote:
> This needs symbol version updates for 2.31.
> 

Ack, I will update is and re-send it.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-28 21:02   ` Adhemerval Zanella
@ 2019-08-28 21:23     ` Florian Weimer
  2019-08-29 11:04       ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-08-28 21:23 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> On 28/08/2019 11:42, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>>    struct kernel_dirent
>>> +  {
>>> +    unsigned long d_ino;
>>> +    unsigned long d_off;
>>> +    unsigned short int d_reclen;
>>> +    char d_name[1];
>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>> 
>> I think it's still not clear to me in which cases we actually need to
>> move the dirent entries in the buffer.  My impression is that we just
>> need to move d_name by one byte because before, d_type was after the
>> name, and afterwards, it comes before the name.  But the record
>> boundaries are unchanged.
>
> My understanding is the record boundary would be same as long the 
> d_name fits on the alignment padding space minus the size of the
> d_type.  Otherwise the dirent64 will need to be extended.

Hmm.  The problem is mips64 n32, right?  Where unsigned long is 32 bits?

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
  2019-08-28 14:09   ` Adhemerval Zanella
@ 2019-08-29  8:38   ` Florian Weimer
  2019-08-29 11:26     ` Adhemerval Zanella
  1 sibling, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-08-29  8:38 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> 	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
> 	New macros.

Why isn't a regular release store/acquire load synchronization
sufficient here?

I wonder if we can get kernel support for this in the new clone system
call with more flags.  Then we don't have to complicate the sigaction
implementation.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-28 21:23     ` Florian Weimer
@ 2019-08-29 11:04       ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-29 11:04 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 28/08/2019 18:23, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On 28/08/2019 11:42, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>>    struct kernel_dirent
>>>> +  {
>>>> +    unsigned long d_ino;
>>>> +    unsigned long d_off;
>>>> +    unsigned short int d_reclen;
>>>> +    char d_name[1];
>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>>>
>>> I think it's still not clear to me in which cases we actually need to
>>> move the dirent entries in the buffer.  My impression is that we just
>>> need to move d_name by one byte because before, d_type was after the
>>> name, and afterwards, it comes before the name.  But the record
>>> boundaries are unchanged.
>>
>> My understanding is the record boundary would be same as long the 
>> d_name fits on the alignment padding space minus the size of the
>> d_type.  Otherwise the dirent64 will need to be extended.
> 
> Hmm.  The problem is mips64 n32, right?  Where unsigned long is 32 bits?

Yes, it would require either a mips64-n32 specific path or implementation.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-08-29  8:38   ` Florian Weimer
@ 2019-08-29 11:26     ` Adhemerval Zanella
  2019-08-30 10:07       ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-29 11:26 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 29/08/2019 05:38, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> 	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
>> 	New macros.
> 
> Why isn't a regular release store/acquire load synchronization
> sufficient here?

It should works, my understanding is a weaker store barrier might incur in
a slight more false positive in a highly concurrent sigaction call scenario.
But I assume that this is not a common scenario, so I used the strongest
barrier just to avoid the extra false positives.

> 
> I wonder if we can get kernel support for this in the new clone system
> call with more flags.  Then we don't have to complicate the sigaction
> implementation.

Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
disposition to default SIG_IGN/SIG_DFL values may help us here.  However
afaik clone now is out of space on 'flags' for newer ones (it already
defines 24 flags plus it reserve 8 bits for signal to be sent at process
exit) and it would take time to use this feature on glibc.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
                   ` (6 preceding siblings ...)
  2019-08-28 14:42 ` Florian Weimer
@ 2019-08-30  9:53 ` Florian Weimer
  2019-08-30 12:53   ` Adhemerval Zanella
  7 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-08-30  9:53 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

Sorry, I missed that despite all the 64s in the patch, this code is also
used on 32-bit architectures.  The review below should take this new (to
me) piece of information into account.

* Adhemerval Zanella:

> +  static bool getdents64_supportted = true;
> +  if (atomic_load_relaxed (&getdents64_supportted))

Our atomics do not support bool, only 4 bytes and 8 bytes (if
__HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.

Probably it will work if it compiles, but I haven't checked that.

>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
> +     If the syscall is not available it need to fallback to the non-LFS one.
> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
> +     would make the syscall non async-signal-safe) it uses a limited buffer.
> +     This is sub-optimal for large NBYTES, however this is a fallback
> +     mechanism to emulate a syscall that kernel should provide.   */
>  
> +  enum { KBUF_SIZE = 1024 };

The choice of size needs a comment.  I think the largest possible
practical length of the d_name member are 255 Unicode characters in the
BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
in reality.)

>    struct kernel_dirent
> +  {
> +    unsigned long d_ino;
> +    unsigned long d_off;
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;

I would define kbuf as a char array, and perhaps leave out the d_name
member in struct kernel_dirent.  You can copy out the struct
kernel_dirent using memcpy, which GCC should optimize away.

Ideally, we would perform the conversion in-line, with a forward scan to
make the d_reclen members point backwards, followed by a backwards scan
to move everything in place.  This would reduce stack usage quite
significantly and avoid a hard restriction on d_name length.

>    struct dirent64 *dp = (struct dirent64 *) buf;
> +
> +  size_t nb = 0;
> +  off64_t last_offset = -1;
> +
> +  ssize_t r;
> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>      {

Sorry, I don't see how the outer loop is exited.  I think we should
remove it because it does not seem necessary.

> +      struct kernel_dirent *skdp, *kdp;
> +      skdp = kdp = kbuf;
> +
> +      while ((char *) kdp < (char *) skdp + r)
> +	{
> +	  const size_t alignment = _Alignof (struct dirent64);
> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
> +			      & ~(alignment - 1));

I think this is the roundup macro.  If you use that, I think you don't
need the alignment variable.

Is the length really correct, though?  I'd expect it to grow by the
additional size of the d_ino and d_off members.  I think it would be
best recompute it from scratch, using the actual length of d_name.

> +	  if (nb + new_reclen > nbytes)
>  	    {
> +		/* The new entry will overflow the input buffer, rewind to
> +		   last obtained entry and return.  */
> +	       __lseek64 (fd, last_offset, SEEK_SET);

I don't think last_offset is guaranteed to have been set with a proper
offset at this point.  Given that d_name is essentially of unbounded
length, even expanding the first entry can cause failure.

Maybe it's possible to avoid this corner case by limiting the amount of
data being read so that we know that the application-supplied buffer is
always large enough for any possible expansion.  I think the worse-case
growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
should divide the buffer size by 1.6 and use that?

> +	       goto out;
>  	    }
> +	  nb += new_reclen;
>  
> +	  dp->d_ino = kdp->d_ino;
> +	  dp->d_off = last_offset = kdp->d_off;
> +	  dp->d_reclen = new_reclen;
> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

I think instead of reading through kdp, you should use char *s and
memcpy, to avoid the aliasing violation, as discussed above.  Likewise
for writing to dp.

> +	  memcpy (dp->d_name, kdp->d_name,
> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));

See above, I have concerns about the length.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-08-29 11:26     ` Adhemerval Zanella
@ 2019-08-30 10:07       ` Florian Weimer
  2019-08-30 13:05         ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-08-30 10:07 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> On 29/08/2019 05:38, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>> 	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
>>> 	New macros.
>> 
>> Why isn't a regular release store/acquire load synchronization
>> sufficient here?
>
> It should works, my understanding is a weaker store barrier might incur in
> a slight more false positive in a highly concurrent sigaction call scenario.
> But I assume that this is not a common scenario, so I used the strongest
> barrier just to avoid the extra false positives.

I don't see how false positives are possible.  It would require bits
getting set which have never been added to the mask, which would be a
bug even for relaxed MO (as a QoI issue, the memory model is buggy and
allows this).

My main worry would be reading an outdated value in posix_spawn, but my
understanding is that the release store/acquire load synchronization
avoids that.

>> I wonder if we can get kernel support for this in the new clone system
>> call with more flags.  Then we don't have to complicate the sigaction
>> implementation.
>
> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
> afaik clone now is out of space on 'flags' for newer ones (it already
> defines 24 flags plus it reserve 8 bits for signal to be sent at process
> exit) and it would take time to use this feature on glibc.

Christian Brauner has been working on fixing this.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-30  9:53 ` Florian Weimer
@ 2019-08-30 12:53   ` Adhemerval Zanella
  2019-09-02 12:59     ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-30 12:53 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 30/08/2019 06:52, Florian Weimer wrote:
> Sorry, I missed that despite all the 64s in the patch, this code is also
> used on 32-bit architectures.  The review below should take this new (to
> me) piece of information into account.
> 
> * Adhemerval Zanella:
> 
>> +  static bool getdents64_supportted = true;
>> +  if (atomic_load_relaxed (&getdents64_supportted))
> 
> Our atomics do not support bool, only 4 bytes and 8 bytes (if
> __HAVE_64B_ATOMICS) is defined.  See __atomic_check_size.
> 
> Probably it will work if it compiles, but I haven't checked that.

MIPS in fact does not define USE_ATOMIC_COMPILER_BUILTINS and thus it uses
the __atomic_check_size_ls macro.  In any case, I changed to a int for this 
case.

> 
>>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
>> +     If the syscall is not available it need to fallback to the non-LFS one.
>> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
>> +     would make the syscall non async-signal-safe) it uses a limited buffer.
>> +     This is sub-optimal for large NBYTES, however this is a fallback
>> +     mechanism to emulate a syscall that kernel should provide.   */
>>  
>> +  enum { KBUF_SIZE = 1024 };
> 
> The choice of size needs a comment.  I think the largest possible
> practical length of the d_name member are 255 Unicode characters in the
> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
> in reality.)

I picked the buffer as an arbitrary value, what about:

  /* The largest possible practical length of the d_name member are 255
     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
     / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
     least one entry.  */
  enum { KBUF_SIZE = 1024 };


> 
>>    struct kernel_dirent
>> +  {
>> +    unsigned long d_ino;
>> +    unsigned long d_off;
>> +    unsigned short int d_reclen;
>> +    char d_name[1];
>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
> 
> I would define kbuf as a char array, and perhaps leave out the d_name
> member in struct kernel_dirent.  You can copy out the struct
> kernel_dirent using memcpy, which GCC should optimize away.

I defined the buffer as 'struct kernel_dirent' to make it easier to align
for the required fields.  It allows simplify the access on the loop to
avoid memcpy calls.

> 
> Ideally, we would perform the conversion in-line, with a forward scan to
> make the d_reclen members point backwards, followed by a backwards scan
> to move everything in place.  This would reduce stack usage quite
> significantly and avoid a hard restriction on d_name length.

I take this fallback code as a best effort due the restrictions we have
(make it async-signal-safe with bounded stack allocation).  Even with a
clever buffer managements we can't remove the d_name length with the
aforementioned restrictions.  The proper solution is indeed using the
syscall.

> 
>>    struct dirent64 *dp = (struct dirent64 *) buf;
>> +
>> +  size_t nb = 0;
>> +  off64_t last_offset = -1;
>> +
>> +  ssize_t r;
>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>      {
> 
> Sorry, I don't see how the outer loop is exited.  I think we should
> remove it because it does not seem necessary.

We still need to handle the cases where NBYTES are larger than the temporary
buffer, because it might require multiple getdents calls.


> 
>> +      struct kernel_dirent *skdp, *kdp;
>> +      skdp = kdp = kbuf;
>> +
>> +      while ((char *) kdp < (char *) skdp + r)
>> +	{
>> +	  const size_t alignment = _Alignof (struct dirent64);
>> +	  size_t new_reclen = ((kdp->d_reclen + alignment - 1)
>> +			      & ~(alignment - 1));
> 
> I think this is the roundup macro.  If you use that, I think you don't
> need the alignment variable.

I changed to use ALIGN_UP from libc-pointer-arith.h.

> 
> Is the length really correct, though?  I'd expect it to grow by the
> additional size of the d_ino and d_off members.  I think it would be
> best recompute it from scratch, using the actual length of d_name.

It it because you are referencing to an older patch version, checking on
mips64-n32 I adjusted to:

  const size_t size_diff = (offsetof (struct dirent64, d_name)
                           - offsetof (struct kernel_dirent, d_name));
  [...]
               size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
                                        _Alignof (struct dirent64));
  [...]

> 
>> +	  if (nb + new_reclen > nbytes)
>>  	    {
>> +		/* The new entry will overflow the input buffer, rewind to
>> +		   last obtained entry and return.  */
>> +	       __lseek64 (fd, last_offset, SEEK_SET);
> 
> I don't think last_offset is guaranteed to have been set with a proper
> offset at this point.  Given that d_name is essentially of unbounded
> length, even expanding the first entry can cause failure.
> 
> Maybe it's possible to avoid this corner case by limiting the amount of
> data being read so that we know that the application-supplied buffer is
> always large enough for any possible expansion.  I think the worse-case
> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
> should divide the buffer size by 1.6 and use that?

For this case I really think we just need to return an error to user:

            if (nb + new_reclen > nbytes)
            {   
		/* Entry is too large for the static buffer.  */
		if (last_offset == -1)
		  {
		    __set_errno (EINVAL);
		    return -1;
		  }
                /* The new entry will overflow the input buffer, rewind to
                   last obtained entry and return.  */
               __lseek64 (fd, last_offset, SEEK_SET);
               goto out;
            }

Again I see this fallback code as a best-effort since we are emulating the
syscall with additional restraints.  Most of time glibc tries to play smart
emulating a syscall ended in a lot of headaches...


> 
>> +	       goto out;
>>  	    }
>> +	  nb += new_reclen;
>>  
>> +	  dp->d_ino = kdp->d_ino;
>> +	  dp->d_off = last_offset = kdp->d_off;
>> +	  dp->d_reclen = new_reclen;
>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> 
> I think instead of reading through kdp, you should use char *s and
> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
> for writing to dp.

I think if we proper setting the buffer alignment there is no need to do it.
Also the problem of using memcpy here is for mips64n32 the size is *not* 
equal for dp and kdp, each would require an extra step as:

   {
     typeof (kdp->d_ino) kino;
     memcpy (&kino, &kdp_d->ino, sizeof (kino));
     typeof (dp->d_ino) dino = kino;
     memcpy (&dp->d_ino, &kino, sizeof (dino));
   }

> 
>> +	  memcpy (dp->d_name, kdp->d_name,
>> +		  kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
> 
> See above, I have concerns about the length.
> 
> Thanks,
> Florian
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-08-30 10:07       ` Florian Weimer
@ 2019-08-30 13:05         ` Adhemerval Zanella
  2019-09-02 13:14           ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-08-30 13:05 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 30/08/2019 07:07, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On 29/08/2019 05:38, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>> 	* include/atomic.h (atomic_fetch_or_seq_cst, atomic_fetch_or_seq_cst):
>>>> 	New macros.
>>>
>>> Why isn't a regular release store/acquire load synchronization
>>> sufficient here?
>>
>> It should works, my understanding is a weaker store barrier might incur in
>> a slight more false positive in a highly concurrent sigaction call scenario.
>> But I assume that this is not a common scenario, so I used the strongest
>> barrier just to avoid the extra false positives.
> 
> I don't see how false positives are possible.  It would require bits
> getting set which have never been added to the mask, which would be a
> bug even for relaxed MO (as a QoI issue, the memory model is buggy and
> allows this).
> 
> My main worry would be reading an outdated value in posix_spawn, but my
> understanding is that the release store/acquire load synchronization
> avoids that.

The false positives happens for the case where the signal disposition was
set, the bit in the bitmask set, and the signal disposition reset to default 
value.  The bit will be stick and posix_spawn will always issue the sigaction 
(even though strictly it is not required).

The problem is in fact false negatives, where posix_spawn will get a mask 
*without* the bit set, but with a set signal disposition.  In fact I think
due the syscall, even relaxed operations would work (since the syscall acts
a strong memory barrier).

> 
>>> I wonder if we can get kernel support for this in the new clone system
>>> call with more flags.  Then we don't have to complicate the sigaction
>>> implementation.
>>
>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>> afaik clone now is out of space on 'flags' for newer ones (it already
>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>> exit) and it would take time to use this feature on glibc.
> 
> Christian Brauner has been working on fixing this.

Which strategy he is proposing? Even with proper kernel support, it would
take time to enable glibc to use it.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-08-30 12:53   ` Adhemerval Zanella
@ 2019-09-02 12:59     ` Florian Weimer
  2019-09-02 17:38       ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-09-02 12:59 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

>> The choice of size needs a comment.  I think the largest possible
>> practical length of the d_name member are 255 Unicode characters in the
>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
>> in reality.)
>
> I picked the buffer as an arbitrary value, what about:
>
>   /* The largest possible practical length of the d_name member are 255
>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
>      least one entry.  */
>   enum { KBUF_SIZE = 1024 };

“holds”

Looks good.

>>>    struct kernel_dirent
>>> +  {
>>> +    unsigned long d_ino;
>>> +    unsigned long d_off;
>>> +    unsigned short int d_reclen;
>>> +    char d_name[1];
>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>> 
>> I would define kbuf as a char array, and perhaps leave out the d_name
>> member in struct kernel_dirent.  You can copy out the struct
>> kernel_dirent using memcpy, which GCC should optimize away.
>
> I defined the buffer as 'struct kernel_dirent' to make it easier to align
> for the required fields.  It allows simplify the access on the loop to
> avoid memcpy calls.

But the code is invalid C as a result of this.  We do not compile glibc
with -fno-strict-aliasing, after all.

>>>    struct dirent64 *dp = (struct dirent64 *) buf;
>>> +
>>> +  size_t nb = 0;
>>> +  off64_t last_offset = -1;
>>> +
>>> +  ssize_t r;
>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>>      {
>> 
>> Sorry, I don't see how the outer loop is exited.  I think we should
>> remove it because it does not seem necessary.
>
> We still need to handle the cases where NBYTES are larger than the temporary
> buffer, because it might require multiple getdents calls.

Why?  The application (or readdir) will call us again to get more entries.

>> Is the length really correct, though?  I'd expect it to grow by the
>> additional size of the d_ino and d_off members.  I think it would be
>> best recompute it from scratch, using the actual length of d_name.
>
> It it because you are referencing to an older patch version, checking on
> mips64-n32 I adjusted to:
>
>   const size_t size_diff = (offsetof (struct dirent64, d_name)
>                            - offsetof (struct kernel_dirent, d_name));
>   [...]
>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
>                                         _Alignof (struct dirent64));
>   [...]

Okay, this needs a comment that this is a conservative approximation
(some of size_diff might fit into the existing padding for alignment).

>>> +	  if (nb + new_reclen > nbytes)
>>>  	    {
>>> +		/* The new entry will overflow the input buffer, rewind to
>>> +		   last obtained entry and return.  */
>>> +	       __lseek64 (fd, last_offset, SEEK_SET);
>> 
>> I don't think last_offset is guaranteed to have been set with a proper
>> offset at this point.  Given that d_name is essentially of unbounded
>> length, even expanding the first entry can cause failure.
>> 
>> Maybe it's possible to avoid this corner case by limiting the amount of
>> data being read so that we know that the application-supplied buffer is
>> always large enough for any possible expansion.  I think the worse-case
>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
>> should divide the buffer size by 1.6 and use that?
>
> For this case I really think we just need to return an error to user:
>
>             if (nb + new_reclen > nbytes)
>             {   
> 		/* Entry is too large for the static buffer.  */

Fixed-size buffer, it's not static. 8-)

> 		if (last_offset == -1)
> 		  {
> 		    __set_errno (EINVAL);
> 		    return -1;
> 		  }
>                 /* The new entry will overflow the input buffer, rewind to
>                    last obtained entry and return.  */
>                __lseek64 (fd, last_offset, SEEK_SET);
>                goto out;
>             }
>
> Again I see this fallback code as a best-effort since we are emulating the
> syscall with additional restraints.  Most of time glibc tries to play smart
> emulating a syscall ended in a lot of headaches...

I don't disagree.

Which error code does the kernel return if no entry can be read at all?
We should mirror that.

>>> +	       goto out;
>>>  	    }
>>> +	  nb += new_reclen;
>>>  
>>> +	  dp->d_ino = kdp->d_ino;
>>> +	  dp->d_off = last_offset = kdp->d_off;
>>> +	  dp->d_reclen = new_reclen;
>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
>> 
>> I think instead of reading through kdp, you should use char *s and
>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
>> for writing to dp.
>
> I think if we proper setting the buffer alignment there is no need to do it.
> Also the problem of using memcpy here is for mips64n32 the size is *not* 
> equal for dp and kdp, each would require an extra step as:
>
>    {
>      typeof (kdp->d_ino) kino;
>      memcpy (&kino, &kdp_d->ino, sizeof (kino));
>      typeof (dp->d_ino) dino = kino;
>      memcpy (&dp->d_ino, &kino, sizeof (dino));
>    }

I think that's just the price of writing correct C.  It's also what the
kernel does.

I don't even think there's a requirement that the byte buffer passed to
getdents64 has any kind of alignment.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-08-30 13:05         ` Adhemerval Zanella
@ 2019-09-02 13:14           ` Florian Weimer
  2019-09-02 19:47             ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-09-02 13:14 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> The problem is in fact false negatives, where posix_spawn will get a mask 
> *without* the bit set, but with a set signal disposition.

Hmm.  Right.  Incidentally, the Go routine should be fine with that:

| // When using cgo, call the C library for sigaction, so that we call into
| // any sanitizer interceptors. This supports using the memory
| // sanitizer with Go programs. The memory sanitizer only applies to
| // C/C++ code; this permits that code to see the Go runtime's existing signal
| // handlers when registering new signal handlers for the process.
| 
| //go:cgo_import_static x_cgo_sigaction
| //go:linkname x_cgo_sigaction x_cgo_sigaction
| //go:linkname _cgo_sigaction _cgo_sigaction
| var x_cgo_sigaction byte
| var _cgo_sigaction = &x_cgo_sigaction

libjsig also keeps calling to glibc.

Is there anything else we should check?

> In fact I think due the syscall, even relaxed operations would work
> (since the syscall acts a strong memory barrier).

Only as a signal fence, not a thread fence.  Some architectures can even
keep cache inconsistency across fork system calls.

I find it a bit counter-intuitive that calling sigaction or signal
directly without the glibc wrappers could lead to data corruption, even
when done for standard signals such as SIGINT.  But that's what's going
to happen with this change, unfortunately.

>>>> I wonder if we can get kernel support for this in the new clone system
>>>> call with more flags.  Then we don't have to complicate the sigaction
>>>> implementation.
>>>
>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>>> afaik clone now is out of space on 'flags' for newer ones (it already
>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>>> exit) and it would take time to use this feature on glibc.
>> 
>> Christian Brauner has been working on fixing this.
>
> Which strategy he is proposing? Even with proper kernel support, it would
> take time to enable glibc to use it.

Lots of flag arguments, with the reset of the arguments located
indirectly via a pointer argument.

For a pure optimization, I think it's not too bad to require kernel
backports of system calls.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-09-02 12:59     ` Florian Weimer
@ 2019-09-02 17:38       ` Adhemerval Zanella
  2019-10-07 17:49         ` Adhemerval Zanella
  2019-10-07 18:29         ` Florian Weimer
  0 siblings, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-09-02 17:38 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 02/09/2019 09:59, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>> The choice of size needs a comment.  I think the largest possible
>>> practical length of the d_name member are 255 Unicode characters in the
>>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
>>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
>>> in reality.)
>>
>> I picked the buffer as an arbitrary value, what about:
>>
>>   /* The largest possible practical length of the d_name member are 255
>>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
>>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
>>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
>>      least one entry.  */
>>   enum { KBUF_SIZE = 1024 };
> 
> “holds”
> 
> Looks good.
> 
>>>>    struct kernel_dirent
>>>> +  {
>>>> +    unsigned long d_ino;
>>>> +    unsigned long d_off;
>>>> +    unsigned short int d_reclen;
>>>> +    char d_name[1];
>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>>>
>>> I would define kbuf as a char array, and perhaps leave out the d_name
>>> member in struct kernel_dirent.  You can copy out the struct
>>> kernel_dirent using memcpy, which GCC should optimize away.
>>
>> I defined the buffer as 'struct kernel_dirent' to make it easier to align
>> for the required fields.  It allows simplify the access on the loop to
>> avoid memcpy calls.
> 
> But the code is invalid C as a result of this.  We do not compile glibc
> with -fno-strict-aliasing, after all.

Right, we indeed do some pointer arithmetic to get the kdp value. I change
to use char buffer plus memcpy to obtain the fields.

> 
>>>>    struct dirent64 *dp = (struct dirent64 *) buf;
>>>> +
>>>> +  size_t nb = 0;
>>>> +  off64_t last_offset = -1;
>>>> +
>>>> +  ssize_t r;
>>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>>>      {
>>>
>>> Sorry, I don't see how the outer loop is exited.  I think we should
>>> remove it because it does not seem necessary.
>>
>> We still need to handle the cases where NBYTES are larger than the temporary
>> buffer, because it might require multiple getdents calls.
> 
> Why?  The application (or readdir) will call us again to get more entries.

As a simple optimization to avoid it, but I think we can move to a simplified
version.

> 
>>> Is the length really correct, though?  I'd expect it to grow by the
>>> additional size of the d_ino and d_off members.  I think it would be
>>> best recompute it from scratch, using the actual length of d_name.
>>
>> It it because you are referencing to an older patch version, checking on
>> mips64-n32 I adjusted to:
>>
>>   const size_t size_diff = (offsetof (struct dirent64, d_name)
>>                            - offsetof (struct kernel_dirent, d_name));
>>   [...]
>>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
>>                                         _Alignof (struct dirent64));
>>   [...]
> 
> Okay, this needs a comment that this is a conservative approximation
> (some of size_diff might fit into the existing padding for alignment).

Ack.

> 
>>>> +	  if (nb + new_reclen > nbytes)
>>>>  	    {
>>>> +		/* The new entry will overflow the input buffer, rewind to
>>>> +		   last obtained entry and return.  */
>>>> +	       __lseek64 (fd, last_offset, SEEK_SET);
>>>
>>> I don't think last_offset is guaranteed to have been set with a proper
>>> offset at this point.  Given that d_name is essentially of unbounded
>>> length, even expanding the first entry can cause failure.
>>>
>>> Maybe it's possible to avoid this corner case by limiting the amount of
>>> data being read so that we know that the application-supplied buffer is
>>> always large enough for any possible expansion.  I think the worse-case
>>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
>>> should divide the buffer size by 1.6 and use that?
>>
>> For this case I really think we just need to return an error to user:
>>
>>             if (nb + new_reclen > nbytes)
>>             {   
>> 		/* Entry is too large for the static buffer.  */
> 
> Fixed-size buffer, it's not static. 8-)

Ack.

> 
>> 		if (last_offset == -1)
>> 		  {
>> 		    __set_errno (EINVAL);
>> 		    return -1;
>> 		  }
>>                 /* The new entry will overflow the input buffer, rewind to
>>                    last obtained entry and return.  */
>>                __lseek64 (fd, last_offset, SEEK_SET);
>>                goto out;
>>             }
>>
>> Again I see this fallback code as a best-effort since we are emulating the
>> syscall with additional restraints.  Most of time glibc tries to play smart
>> emulating a syscall ended in a lot of headaches...
> 
> I don't disagree.
> 
> Which error code does the kernel return if no entry can be read at all?
> We should mirror that.

It returns -1/EINVAL.

fs/readdir.c
177         if (reclen > buf->count)                                                                    
178                 return -EINVAL;

> 
>>>> +	       goto out;
>>>>  	    }
>>>> +	  nb += new_reclen;
>>>>  
>>>> +	  dp->d_ino = kdp->d_ino;
>>>> +	  dp->d_off = last_offset = kdp->d_off;
>>>> +	  dp->d_reclen = new_reclen;
>>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
>>>
>>> I think instead of reading through kdp, you should use char *s and
>>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
>>> for writing to dp.
>>
>> I think if we proper setting the buffer alignment there is no need to do it.
>> Also the problem of using memcpy here is for mips64n32 the size is *not* 
>> equal for dp and kdp, each would require an extra step as:
>>
>>    {
>>      typeof (kdp->d_ino) kino;
>>      memcpy (&kino, &kdp_d->ino, sizeof (kino));
>>      typeof (dp->d_ino) dino = kino;
>>      memcpy (&dp->d_ino, &kino, sizeof (dino));
>>    }
> 
> I think that's just the price of writing correct C.  It's also what the
> kernel does.

Ack.

> 
> I don't even think there's a requirement that the byte buffer passed to
> getdents64 has any kind of alignment.
> 
> Thanks,
> Florian
> 

Updated patch below.

--

This patch changes how the fallback getdents64 implementation calls
non-LFS getdents by replacing the scratch_buffer with static buffer
plus a loop on getdents calls.  This avoids the potential malloc
call on scratch_buffer_set_array_size for large input buffer size
at the cost of more getdents syscalls.

It also adds a small optimization for older kernels, where the first
ENOSYS failure for getdents64 disable subsequent calls.

Check the dirent tests on a mips64-linux-gnu with getdents64 code
disabled.

	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
	Add small optimization for older kernel to avoid issuing
	__NR_getdents64 on each call and replace scratch_buffer usage with
	a static allocated buffer.
---
 .../unix/sysv/linux/mips/mips64/getdents64.c  | 133 ++++++++++--------
 1 file changed, 76 insertions(+), 57 deletions(-)

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index 8bf3abb0e0..02e15a0b2e 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,88 +22,108 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
+#include <include/libc-pointer-arith.h>
+
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static int getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  };
+
+  /* The largest possible practical length of the d_name member are 255
+     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
+     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
+     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at
+     least one entry.  */
+  enum { KBUF_SIZE = 1024 };
+  char kbuf[KBUF_SIZE];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
+                           - offsetof (struct kernel_dirent, d_name));
 
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
+  size_t nb = 0;
+  off64_t last_offset = -1;
 
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);
+  if (r <= 0)
+    return r;
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct kernel_dirent *skdp, *kdp;
+  skdp = kdp = (struct kernel_dirent *) kbuf;
 
-  off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+  while ((char *) kdp < (char *) skdp + r)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      /* This is a conservative approximation, since some of size_diff might
+	 fit into the existing padding for alignment.  */
+      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
+				    _Alignof (struct dirent64));
+      if (nb + new_reclen > nbytes)
+	{
+	  /* Entry is too large for the fixed-size buffer.  */
+	  if (last_offset == -1)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+	      __set_errno (EINVAL);
+	      return -1;
 	    }
 
-	  break;
+	  /* The new entry will overflow the input buffer, rewind to last
+	     obtained entry and return.  */
+	  __lseek64 (fd, last_offset, SEEK_SET);
+	  return (char *) dp - (char *) buf;
 	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
+      nb += new_reclen;
+
+#define copy_field(dst, src)			\
+  ({						\
+     typeof (src) _src;				\
+     memcpy (&_src, &(src), sizeof (src));	\
+     typeof (dst) _dst = _src;			\
+     memcpy (&(dst), &_dst, sizeof (dst));	\
+  })
+
+      copy_field (dp->d_ino, kdp->d_ino);
+      copy_field (dp->d_off, kdp->d_off);
+      copy_field (last_offset, kdp->d_off);
+      copy_field (dp->d_reclen, new_reclen);
       dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
       memcpy (dp->d_name, kdp->d_name,
 	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
@@ -112,8 +132,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)
       kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64)

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-09-02 13:14           ` Florian Weimer
@ 2019-09-02 19:47             ` Adhemerval Zanella
  2019-10-07 17:51               ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-09-02 19:47 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 02/09/2019 10:14, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> The problem is in fact false negatives, where posix_spawn will get a mask 
>> *without* the bit set, but with a set signal disposition.
> 
> Hmm.  Right.  Incidentally, the Go routine should be fine with that:
> 
> | // When using cgo, call the C library for sigaction, so that we call into
> | // any sanitizer interceptors. This supports using the memory
> | // sanitizer with Go programs. The memory sanitizer only applies to
> | // C/C++ code; this permits that code to see the Go runtime's existing signal
> | // handlers when registering new signal handlers for the process.
> | 
> | //go:cgo_import_static x_cgo_sigaction
> | //go:linkname x_cgo_sigaction x_cgo_sigaction
> | //go:linkname _cgo_sigaction _cgo_sigaction
> | var x_cgo_sigaction byte
> | var _cgo_sigaction = &x_cgo_sigaction
> 
> libjsig also keeps calling to glibc.
> 
> Is there anything else we should check?

No idea, my take on that is once you start to calling syscall directly
where libbc provide a wrapper you are in your own. We had a similar
discussing with clone usage by some container applications and their
expectation regarding libc internal state afterwards. 

> 
>> In fact I think due the syscall, even relaxed operations would work
>> (since the syscall acts a strong memory barrier).
> 
> Only as a signal fence, not a thread fence.  Some architectures can even
> keep cache inconsistency across fork system calls.
> 
> I find it a bit counter-intuitive that calling sigaction or signal
> directly without the glibc wrappers could lead to data corruption, even
> when done for standard signals such as SIGINT.  But that's what's going
> to happen with this change, unfortunately.

What is counter-intuitive imho is to rely on libc to keep its internal
consistency by bypassing it. This might be even worse if glibc start to
wrapper the signal handler as a way to implement BZ#19702, for instance.

One thing we may do it to make it clean on manual that an application is
*not* expect to call sigaction using syscall().

> 
>>>>> I wonder if we can get kernel support for this in the new clone system
>>>>> call with more flags.  Then we don't have to complicate the sigaction
>>>>> implementation.
>>>>
>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>>>> afaik clone now is out of space on 'flags' for newer ones (it already
>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>>>> exit) and it would take time to use this feature on glibc.
>>>
>>> Christian Brauner has been working on fixing this.
>>
>> Which strategy he is proposing? Even with proper kernel support, it would
>> take time to enable glibc to use it.
> 
> Lots of flag arguments, with the reset of the arguments located
> indirectly via a pointer argument.
> 
> For a pure optimization, I think it's not too bad to require kernel
> backports of system calls.
> 
> Thanks,
> Florian
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-09-02 17:38       ` Adhemerval Zanella
@ 2019-10-07 17:49         ` Adhemerval Zanella
  2019-10-07 18:29         ` Florian Weimer
  1 sibling, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 17:49 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha

Ping.

On 02/09/2019 14:38, Adhemerval Zanella wrote:
> 
> 
> On 02/09/2019 09:59, Florian Weimer wrote:
>> * Adhemerval Zanella:
>>
>>>> The choice of size needs a comment.  I think the largest possible
>>>> practical length of the d_name member are 255 Unicode characters in the
>>>> BMP, in UTF-8 encoding, so d_name is 766 bytes long, plus 10 bytes from
>>>> the header, for 776 bytes total.  (NAME_MAX is not a constant on Linux
>>>> in reality.)
>>>
>>> I picked the buffer as an arbitrary value, what about:
>>>
>>>   /* The largest possible practical length of the d_name member are 255
>>>      Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
>>>      18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
>>>      / 776 (mips64n32) bytes total.  Ensure that the minimum size hold at
>>>      least one entry.  */
>>>   enum { KBUF_SIZE = 1024 };
>>
>> “holds”
>>
>> Looks good.
>>
>>>>>    struct kernel_dirent
>>>>> +  {
>>>>> +    unsigned long d_ino;
>>>>> +    unsigned long d_off;
>>>>> +    unsigned short int d_reclen;
>>>>> +    char d_name[1];
>>>>> +  } kbuf[KBUF_SIZE / sizeof (struct kernel_dirent)];
>>>>> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>>>>
>>>> I would define kbuf as a char array, and perhaps leave out the d_name
>>>> member in struct kernel_dirent.  You can copy out the struct
>>>> kernel_dirent using memcpy, which GCC should optimize away.
>>>
>>> I defined the buffer as 'struct kernel_dirent' to make it easier to align
>>> for the required fields.  It allows simplify the access on the loop to
>>> avoid memcpy calls.
>>
>> But the code is invalid C as a result of this.  We do not compile glibc
>> with -fno-strict-aliasing, after all.
> 
> Right, we indeed do some pointer arithmetic to get the kdp value. I change
> to use char buffer plus memcpy to obtain the fields.
> 
>>
>>>>>    struct dirent64 *dp = (struct dirent64 *) buf;
>>>>> +
>>>>> +  size_t nb = 0;
>>>>> +  off64_t last_offset = -1;
>>>>> +
>>>>> +  ssize_t r;
>>>>> +  while ((r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size)) > 0)
>>>>>      {
>>>>
>>>> Sorry, I don't see how the outer loop is exited.  I think we should
>>>> remove it because it does not seem necessary.
>>>
>>> We still need to handle the cases where NBYTES are larger than the temporary
>>> buffer, because it might require multiple getdents calls.
>>
>> Why?  The application (or readdir) will call us again to get more entries.
> 
> As a simple optimization to avoid it, but I think we can move to a simplified
> version.
> 
>>
>>>> Is the length really correct, though?  I'd expect it to grow by the
>>>> additional size of the d_ino and d_off members.  I think it would be
>>>> best recompute it from scratch, using the actual length of d_name.
>>>
>>> It it because you are referencing to an older patch version, checking on
>>> mips64-n32 I adjusted to:
>>>
>>>   const size_t size_diff = (offsetof (struct dirent64, d_name)
>>>                            - offsetof (struct kernel_dirent, d_name));
>>>   [...]
>>>                size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
>>>                                         _Alignof (struct dirent64));
>>>   [...]
>>
>> Okay, this needs a comment that this is a conservative approximation
>> (some of size_diff might fit into the existing padding for alignment).
> 
> Ack.
> 
>>
>>>>> +	  if (nb + new_reclen > nbytes)
>>>>>  	    {
>>>>> +		/* The new entry will overflow the input buffer, rewind to
>>>>> +		   last obtained entry and return.  */
>>>>> +	       __lseek64 (fd, last_offset, SEEK_SET);
>>>>
>>>> I don't think last_offset is guaranteed to have been set with a proper
>>>> offset at this point.  Given that d_name is essentially of unbounded
>>>> length, even expanding the first entry can cause failure.
>>>>
>>>> Maybe it's possible to avoid this corner case by limiting the amount of
>>>> data being read so that we know that the application-supplied buffer is
>>>> always large enough for any possible expansion.  I think the worse-case
>>>> growth is for lengths 5 to 8, from 20 bytes to 32 bytes.  So perhaps we
>>>> should divide the buffer size by 1.6 and use that?
>>>
>>> For this case I really think we just need to return an error to user:
>>>
>>>             if (nb + new_reclen > nbytes)
>>>             {   
>>> 		/* Entry is too large for the static buffer.  */
>>
>> Fixed-size buffer, it's not static. 8-)
> 
> Ack.
> 
>>
>>> 		if (last_offset == -1)
>>> 		  {
>>> 		    __set_errno (EINVAL);
>>> 		    return -1;
>>> 		  }
>>>                 /* The new entry will overflow the input buffer, rewind to
>>>                    last obtained entry and return.  */
>>>                __lseek64 (fd, last_offset, SEEK_SET);
>>>                goto out;
>>>             }
>>>
>>> Again I see this fallback code as a best-effort since we are emulating the
>>> syscall with additional restraints.  Most of time glibc tries to play smart
>>> emulating a syscall ended in a lot of headaches...
>>
>> I don't disagree.
>>
>> Which error code does the kernel return if no entry can be read at all?
>> We should mirror that.
> 
> It returns -1/EINVAL.
> 
> fs/readdir.c
> 177         if (reclen > buf->count)                                                                    
> 178                 return -EINVAL;
> 
>>
>>>>> +	       goto out;
>>>>>  	    }
>>>>> +	  nb += new_reclen;
>>>>>  
>>>>> +	  dp->d_ino = kdp->d_ino;
>>>>> +	  dp->d_off = last_offset = kdp->d_off;
>>>>> +	  dp->d_reclen = new_reclen;
>>>>> +	  dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
>>>>
>>>> I think instead of reading through kdp, you should use char *s and
>>>> memcpy, to avoid the aliasing violation, as discussed above.  Likewise
>>>> for writing to dp.
>>>
>>> I think if we proper setting the buffer alignment there is no need to do it.
>>> Also the problem of using memcpy here is for mips64n32 the size is *not* 
>>> equal for dp and kdp, each would require an extra step as:
>>>
>>>    {
>>>      typeof (kdp->d_ino) kino;
>>>      memcpy (&kino, &kdp_d->ino, sizeof (kino));
>>>      typeof (dp->d_ino) dino = kino;
>>>      memcpy (&dp->d_ino, &kino, sizeof (dino));
>>>    }
>>
>> I think that's just the price of writing correct C.  It's also what the
>> kernel does.
> 
> Ack.
> 
>>
>> I don't even think there's a requirement that the byte buffer passed to
>> getdents64 has any kind of alignment.
>>
>> Thanks,
>> Florian
>>
> 
> Updated patch below.
> 
> --
> 
> This patch changes how the fallback getdents64 implementation calls
> non-LFS getdents by replacing the scratch_buffer with static buffer
> plus a loop on getdents calls.  This avoids the potential malloc
> call on scratch_buffer_set_array_size for large input buffer size
> at the cost of more getdents syscalls.
> 
> It also adds a small optimization for older kernels, where the first
> ENOSYS failure for getdents64 disable subsequent calls.
> 
> Check the dirent tests on a mips64-linux-gnu with getdents64 code
> disabled.
> 
> 	* sysdeps/unix/sysv/linux/mips/mips64/getdents64.c (__getdents64):
> 	Add small optimization for older kernel to avoid issuing
> 	__NR_getdents64 on each call and replace scratch_buffer usage with
> 	a static allocated buffer.
> ---
>  .../unix/sysv/linux/mips/mips64/getdents64.c  | 133 ++++++++++--------
>  1 file changed, 76 insertions(+), 57 deletions(-)
> 
> diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> index 8bf3abb0e0..02e15a0b2e 100644
> --- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> +++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
> @@ -22,88 +22,108 @@
>  #include <assert.h>
>  #include <sys/param.h>
>  #include <unistd.h>
> -#include <scratch_buffer.h>
>  #include <limits.h>
>  
> +#include <include/libc-pointer-arith.h>
> +
>  ssize_t
> -__getdents64 (int fd, void *buf0, size_t nbytes)
> +__getdents64 (int fd, void *buf, size_t nbytes)
>  {
> -  char *buf = buf0;
> -
>    /* The system call takes an unsigned int argument, and some length
>       checks in the kernel use an int type.  */
>    if (nbytes > INT_MAX)
>      nbytes = INT_MAX;
>  
>  #ifdef __NR_getdents64
> -  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> -  if (ret != -1)
> -    return ret;
> +  static int getdents64_supported = true;
> +  if (atomic_load_relaxed (&getdents64_supported))
> +    {
> +      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
> +      if (ret >= 0 || errno != ENOSYS)
> +	return ret;
> +
> +      atomic_store_relaxed (&getdents64_supported, false);
> +    }
>  #endif
>  
>    /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
> -     If syscall is not available it need to fallback to non-LFS one.  */
> +     If the syscall is not available it need to fallback to the non-LFS one.
> +     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
> +     would make the syscall non async-signal-safe) it uses a limited buffer.
> +     This is sub-optimal for large NBYTES, however this is a fallback
> +     mechanism to emulate a syscall that kernel should provide.   */
>  
>    struct kernel_dirent
> -    {
> -      unsigned long d_ino;
> -      unsigned long d_off;
> -      unsigned short int d_reclen;
> -      char d_name[256];
> -    };
> +  {
> +#if _MIPS_SIM == _ABI64
> +    uint64_t d_ino;
> +    uint64_t d_off;
> +#else
> +    uint32_t d_ino;
> +    uint32_t d_off;
> +#endif
> +    unsigned short int d_reclen;
> +    char d_name[1];
> +  };
> +
> +  /* The largest possible practical length of the d_name member are 255
> +     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
> +     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
> +     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at
> +     least one entry.  */
> +  enum { KBUF_SIZE = 1024 };
> +  char kbuf[KBUF_SIZE];
> +  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
>  
>    const size_t size_diff = (offsetof (struct dirent64, d_name)
> -			   - offsetof (struct kernel_dirent, d_name));
> +                           - offsetof (struct kernel_dirent, d_name));
>  
> -  size_t red_nbytes = MIN (nbytes
> -			   - ((nbytes / (offsetof (struct dirent64, d_name)
> -					 + 14)) * size_diff),
> -			   nbytes - size_diff);
> +  struct dirent64 *dp = (struct dirent64 *) buf;
>  
> -  struct scratch_buffer tmpbuf;
> -  scratch_buffer_init (&tmpbuf);
> -  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
> -    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
> +  size_t nb = 0;
> +  off64_t last_offset = -1;
>  
> -  struct kernel_dirent *skdp, *kdp;
> -  skdp = kdp = tmpbuf.data;
> +  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);
> +  if (r <= 0)
> +    return r;
>  
> -  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
> -  if (retval == -1)
> -    {
> -      scratch_buffer_free (&tmpbuf);
> -      return -1;
> -    }
> +  struct kernel_dirent *skdp, *kdp;
> +  skdp = kdp = (struct kernel_dirent *) kbuf;
>  
> -  off64_t last_offset = -1;
> -  struct dirent64 *dp = (struct dirent64 *) buf;
> -  while ((char *) kdp < (char *) skdp + retval)
> +  while ((char *) kdp < (char *) skdp + r)
>      {
> -      const size_t alignment = _Alignof (struct dirent64);
> -      /* Since kdp->d_reclen is already aligned for the kernel structure
> -	 this may compute a value that is bigger than necessary.  */
> -      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
> -			   & ~(alignment - 1));
> -      if ((char *) dp + new_reclen > buf + nbytes)
> -        {
> -	  /* Our heuristic failed.  We read too many entries.  Reset
> -	     the stream.  */
> -	  assert (last_offset != -1);
> -	  __lseek64 (fd, last_offset, SEEK_SET);
> -
> -	  if ((char *) dp == buf)
> +      /* This is a conservative approximation, since some of size_diff might
> +	 fit into the existing padding for alignment.  */
> +      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
> +				    _Alignof (struct dirent64));
> +      if (nb + new_reclen > nbytes)
> +	{
> +	  /* Entry is too large for the fixed-size buffer.  */
> +	  if (last_offset == -1)
>  	    {
> -	      scratch_buffer_free (&tmpbuf);
> -	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
> +	      __set_errno (EINVAL);
> +	      return -1;
>  	    }
>  
> -	  break;
> +	  /* The new entry will overflow the input buffer, rewind to last
> +	     obtained entry and return.  */
> +	  __lseek64 (fd, last_offset, SEEK_SET);
> +	  return (char *) dp - (char *) buf;
>  	}
> -
> -      last_offset = kdp->d_off;
> -      dp->d_ino = kdp->d_ino;
> -      dp->d_off = kdp->d_off;
> -      dp->d_reclen = new_reclen;
> +      nb += new_reclen;
> +
> +#define copy_field(dst, src)			\
> +  ({						\
> +     typeof (src) _src;				\
> +     memcpy (&_src, &(src), sizeof (src));	\
> +     typeof (dst) _dst = _src;			\
> +     memcpy (&(dst), &_dst, sizeof (dst));	\
> +  })
> +
> +      copy_field (dp->d_ino, kdp->d_ino);
> +      copy_field (dp->d_off, kdp->d_off);
> +      copy_field (last_offset, kdp->d_off);
> +      copy_field (dp->d_reclen, new_reclen);
>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
>        memcpy (dp->d_name, kdp->d_name,
>  	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
> @@ -112,8 +132,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)
>        kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
>      }
>  
> -  scratch_buffer_free (&tmpbuf);
> -  return (char *) dp - buf;
> +  return (char *) dp - (char *) buf;
>  }
>  libc_hidden_def (__getdents64)
>  weak_alias (__getdents64, getdents64)
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn
  2019-08-28 14:09   ` Adhemerval Zanella
@ 2019-10-07 17:50     ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 17:50 UTC (permalink / raw)
  To: libc-alpha

Ping (x2).

On 28/08/2019 11:09, Adhemerval Zanella wrote:
> Ping.
> 
> On 31/07/2019 15:31, Adhemerval Zanella wrote:
>> Changes from previous version:
>>
>>   * Move the logic of stack mapping creation to stackmap.h and
>>     added a guard page allocation for the compatibility case.
>>
>> --
>>
>> The current internal posix_spawn symbol for Linux (__spawni) requires
>> to allocate a dynamic stack based on input arguments to handle the
>> SPAWN_XFLAGS_USE_PATH internal flag, which re-issue the input binary
>> as a shell script if execve call return ENOEXEC (to execute shell
>> scripts with an initial shebang).
>>
>> This is done only for compatibility mode and the generic case does not
>> require the extra calculation plus the potential large mmap/munmap
>> call.  For default case, a pre-defined buffer is sufficed to use on the
>> clone call instead.
>>
>> This patch optimizes Linux spawni by allocating a dynamic stack only
>> for compatibility symbol (SPAWN_XFLAGS_USE_PATH).  For generic case,
>> an mmap allocated buffer is used along with a guard page, similar to
>> what NPTL uses for thread stacks hardening.
>>
>> For default case, it is a fixed code path with fixed stack usage in helper
>> process, so assuming a large enough stack buffer it would never overflow.
>> It also does not prevent to adapt to the vfork-like to re-use process
>> stack, once it is implemented.
>>
>> Checked x86_64-linux-gnu and i686-linux-gnu.
>>
>> 	* sysdeps/unix/sysv/linux/spawni.c (posix_spawn_args): Remove
>> 	argc member.
>> 	(maybe_script_execute): Remove function.
>> 	(execve_compat, __spawni_clone, __spawnix_compat): New function.
>> 	(__spawni_child): Remove maybe_script_execute call.
>> 	(__spawnix): Remove magic stack slack constant with stack_slack
>> 	identifier.
>> 	(__spawni): Only allocates a variable stack when
>> 	SPAWN_XFLAGS_TRY_SHELL is used.
>> 	* posix/stackmap.h: New file.
>> 	* sysdeps/ia64/nptl/pthreaddef.h (NEED_SEPARATE_REGISTER_STACK): Move
>> 	to ...
>> 	* sysdeps/ia64/stackinfo.h: ... here.
>> ---
>>  posix/stackmap.h                 | 115 +++++++++++++
>>  sysdeps/ia64/nptl/pthreaddef.h   |   3 -
>>  sysdeps/ia64/stackinfo.h         |   3 +
>>  sysdeps/unix/sysv/linux/spawni.c | 277 +++++++++++++++++++------------
>>  4 files changed, 285 insertions(+), 113 deletions(-)
>>  create mode 100644 posix/stackmap.h
>>
>> diff --git a/posix/stackmap.h b/posix/stackmap.h
>> new file mode 100644
>> index 0000000000..be500e378a
>> --- /dev/null
>> +++ b/posix/stackmap.h
>> @@ -0,0 +1,115 @@
>> +/* Functions to create stack mappings for helper processes.
>> +   Copyright (C) 2019 Free Software Foundation, Inc.
>> +   This file is part of the GNU C Library.
>> +
>> +   The GNU C Library is free software; you can redistribute it and/or
>> +   modify it under the terms of the GNU Lesser General Public
>> +   License as published by the Free Software Foundation; either
>> +   version 2.1 of the License, or (at your option) any later version.
>> +
>> +   The GNU C Library is distributed in the hope that it will be useful,
>> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
>> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> +   Lesser General Public License for more details.
>> +
>> +   You should have received a copy of the GNU Lesser General Public
>> +   License along with the GNU C Library; if not, see
>> +   <http://www.gnu.org/licenses/>.  */
>> +
>> +#ifndef _STACKMAP_H
>> +#define _STACKMAP_H
>> +
>> +#include <unistd.h>
>> +#include <sys/mman.h>
>> +#include <ldsodefs.h>
>> +#include <stdbool.h>
>> +
>> +static inline int
>> +stack_prot (void)
>> +{
>> +  return (PROT_READ | PROT_WRITE
>> +	  | ((GL(dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>> +}
>> +
>> +static inline size_t
>> +stack_guard_size (void)
>> +{
>> + return GLRO (dl_pagesize);
>> +}
>> +
>> +/* Return a aligning mask based on system pagesize.  */
>> +static inline size_t
>> +stack_pagesize_m1_mask (void)
>> +{
>> +  size_t pagesize_m1 = __getpagesize () - 1;
>> +  return ~pagesize_m1;
>> +}
>> +
>> +/* Return the guard page position on memory segment MEM with total size SIZE
>> +   and with a guard page of size GUARDIZE.  */
>> +static inline void *
>> +stack_guard_position (void *mem, size_t size, size_t guardsize)
>> +{
>> +#ifdef NEED_SEPARATE_REGISTER_STACK
>> +  return mem + (((size - guardsize) / 2) & stack_pagesize_m1_mask ());
>> +#elif _STACK_GROWS_DOWN
>> +  return mem;
>> +#elif _STACK_GROWS_UP
>> +  return (void *) (((uintptr_t)(mem + size)- guardsize)
>> +		   & stack_pagesize_m1_mask ());
>> +#endif
>> +}
>> +
>> +/* Setup the expected stack memory protection value (based on stack_prot)
>> +   for the memory segment MEM with size SIZE based on the guard page
>> +   GUARD with size GUARDSIZE.  The memory segment is expected to be allocated
>> +   with PROT_NOTE.  */
>> +static inline bool
>> +stack_setup_prot (char *mem, size_t size, char *guard, size_t guardsize)
>> +{
>> +  const int prot = stack_prot ();
>> +
>> +  char *guardend = guard + guardsize;
>> +#if _STACK_GROWS_DOWN && !defined(NEED_SEPARATE_REGISTER_STACK)
>> +  /* As defined at guard_position, for architectures with downward stack
>> +     the guard page is always at start of the allocated area.  */
>> +  if (__mprotect (guardend, size - guardsize, prot) != 0)
>> +    return false;
>> +#else
>> +  size_t mprots1 = (uintptr_t) guard - (uintptr_t) mem;
>> +  if (__mprotect (mem, mprots1, prot) != 0)
>> +    return false;
>> +  size_t mprots2 = ((uintptr_t) mem + size) - (uintptr_t) guardend;
>> +  if (__mprotect (guardend, mprots2, prot) != 0)
>> +    return false;
>> +#endif
>> +  return true;
>> +}
>> +
>> +/* Allocated a memory segment with size SIZE plus GUARSIZE with mmap and
>> +   setup the expected protection for both a guard page and the stack
>> +   itself.  */
>> +static inline void *
>> +stack_allocate (size_t size, size_t guardsize)
>> +{
>> +  const int prot = stack_prot ();
>> +
>> +  /* If a guard page is required, avoid committing memory by first
>> +     allocate with PROT_NONE and then reserve with required permission
>> +     excluding the guard page.  */
>> +  void *mem = __mmap (NULL, size, (guardsize == 0) ? prot : PROT_NONE,
>> +		      MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>> +  if (guardsize)
>> +    {
>> +      void *guard = stack_guard_position (mem, size, guardsize);
>> +      if (!stack_setup_prot (mem, size, guard, guardsize))
>> +	{
>> +	  __munmap (mem, size);
>> +	  return MAP_FAILED;
>> +	}
>> +    }
>> +
>> +  return mem;
>> +}
>> +
>> +#endif /* _STACKMAP_H  */
>> diff --git a/sysdeps/ia64/nptl/pthreaddef.h b/sysdeps/ia64/nptl/pthreaddef.h
>> index bf52d5af62..11579f11b4 100644
>> --- a/sysdeps/ia64/nptl/pthreaddef.h
>> +++ b/sysdeps/ia64/nptl/pthreaddef.h
>> @@ -18,9 +18,6 @@
>>  /* Default stack size.  */
>>  #define ARCH_STACK_DEFAULT_SIZE	(32 * 1024 * 1024)
>>  
>> -/* IA-64 uses a normal stack and a register stack.  */
>> -#define NEED_SEPARATE_REGISTER_STACK
>> -
>>  /* Required stack pointer alignment at beginning.  */
>>  #define STACK_ALIGN		16
>>  
>> diff --git a/sysdeps/ia64/stackinfo.h b/sysdeps/ia64/stackinfo.h
>> index 6433a89945..d942426fcf 100644
>> --- a/sysdeps/ia64/stackinfo.h
>> +++ b/sysdeps/ia64/stackinfo.h
>> @@ -30,4 +30,7 @@
>>  /* Default to a non-executable stack.  */
>>  #define DEFAULT_STACK_PERMS (PF_R|PF_W)
>>  
>> +/* IA-64 uses a normal stack and a register stack.  */
>> +#define NEED_SEPARATE_REGISTER_STACK
>> +
>>  #endif	/* stackinfo.h */
>> diff --git a/sysdeps/unix/sysv/linux/spawni.c b/sysdeps/unix/sysv/linux/spawni.c
>> index ca7bf99825..0f7a8ca5df 100644
>> --- a/sysdeps/unix/sysv/linux/spawni.c
>> +++ b/sysdeps/unix/sysv/linux/spawni.c
>> @@ -23,10 +23,11 @@
>>  #include <not-cancel.h>
>>  #include <local-setxid.h>
>>  #include <shlib-compat.h>
>> -#include <sigsetops.h>
>> -#include <internal-signals.h>
>> -#include <ldsodefs.h>
>> +#include <nptl/pthreadP.h>
>>  #include <ctype.h>
>> +#include <dl-sysdep.h>
>> +#include <libc-pointer-arith.h>
>> +#include <stackmap.h>
>>  #include "spawn_int.h"
>>  
>>  /* The Linux implementation of posix_spawn{p} uses the clone syscall directly
>> @@ -70,7 +71,6 @@
>>  # define STACK(__stack, __stack_size) (__stack + __stack_size)
>>  #endif
>>  
>> -
>>  struct posix_spawn_args
>>  {
>>    sigset_t oldmask;
>> @@ -79,37 +79,11 @@ struct posix_spawn_args
>>    const posix_spawn_file_actions_t *fa;
>>    const posix_spawnattr_t *restrict attr;
>>    char *const *argv;
>> -  ptrdiff_t argc;
>>    char *const *envp;
>>    int xflags;
>>    int err;
>>  };
>>  
>> -/* Older version requires that shell script without shebang definition
>> -   to be called explicitly using /bin/sh (_PATH_BSHELL).  */
>> -static void
>> -maybe_script_execute (struct posix_spawn_args *args)
>> -{
>> -  if (SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
>> -      && (args->xflags & SPAWN_XFLAGS_TRY_SHELL) && errno == ENOEXEC)
>> -    {
>> -      char *const *argv = args->argv;
>> -      ptrdiff_t argc = args->argc;
>> -
>> -      /* Construct an argument list for the shell.  */
>> -      char *new_argv[argc + 2];
>> -      new_argv[0] = (char *) _PATH_BSHELL;
>> -      new_argv[1] = (char *) args->file;
>> -      if (argc > 1)
>> -	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
>> -      else
>> -	new_argv[2] = NULL;
>> -
>> -      /* Execute the shell.  */
>> -      args->exec (new_argv[0], new_argv, args->envp);
>> -    }
>> -}
>> -
>>  /* Close all file descriptor up to FROM by interacting /proc/self/fd.  */
>>  static bool
>>  spawn_closefrom (int from)
>> @@ -152,7 +126,7 @@ spawn_closefrom (int from)
>>     attributes, and file actions.  It run on its own stack (provided by the
>>     posix_spawn call).  */
>>  static int
>> -__spawni_child (void *arguments)
>> +spawni_child (void *arguments)
>>  {
>>    struct posix_spawn_args *args = arguments;
>>    const posix_spawnattr_t *restrict attr = args->attr;
>> @@ -330,11 +304,6 @@ __spawni_child (void *arguments)
>>  
>>    args->exec (args->file, args->argv, args->envp);
>>  
>> -  /* This is compatibility function required to enable posix_spawn run
>> -     script without shebang definition for older posix_spawn versions
>> -     (2.15).  */
>> -  maybe_script_execute (args);
>> -
>>  fail:
>>    /* errno should have an appropriate non-zero value; otherwise,
>>       there's a bug in glibc or the kernel.  For lack of an error code
>> @@ -345,71 +314,12 @@ fail:
>>    _exit (SPAWN_ERROR);
>>  }
>>  
>> -/* Spawn a new process executing PATH with the attributes describes in *ATTRP.
>> -   Before running the process perform the actions described in FILE-ACTIONS. */
>>  static int
>> -__spawnix (pid_t * pid, const char *file,
>> -	   const posix_spawn_file_actions_t * file_actions,
>> -	   const posix_spawnattr_t * attrp, char *const argv[],
>> -	   char *const envp[], int xflags,
>> -	   int (*exec) (const char *, char *const *, char *const *))
>> +spawni_clone (struct posix_spawn_args *args, void *stack, size_t stack_size,
>> +	      pid_t *pid)
>>  {
>> -  pid_t new_pid;
>> -  struct posix_spawn_args args;
>>    int ec;
>> -
>> -  /* To avoid imposing hard limits on posix_spawn{p} the total number of
>> -     arguments is first calculated to allocate a mmap to hold all possible
>> -     values.  */
>> -  ptrdiff_t argc = 0;
>> -  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
>> -     to be used in a execve call.  We limit to INT_MAX minus one due the
>> -     compatiblity code that may execute a shell script (maybe_script_execute)
>> -     where it will construct another argument list with an additional
>> -     argument.  */
>> -  ptrdiff_t limit = INT_MAX - 1;
>> -  while (argv[argc++] != NULL)
>> -    if (argc == limit)
>> -      {
>> -	errno = E2BIG;
>> -	return errno;
>> -      }
>> -
>> -  int prot = (PROT_READ | PROT_WRITE
>> -	     | ((GL (dl_stack_flags) & PF_X) ? PROT_EXEC : 0));
>> -
>> -  size_t argv_size = (argc * sizeof (void *));
>> -  /* We need at least a few pages in case the compiler's stack checking is
>> -     enabled.  In some configs, it is known to use at least 24KiB.  We use
>> -     32KiB to be "safe" from anything the compiler might do.  Besides, the
>> -     extra pages won't actually be allocated unless they get used.
>> -     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
>> -     where it might use about 1k extra stack space.  */
>> -  argv_size += (32 * 1024);
>> -  size_t stack_size = ALIGN_UP (argv_size, GLRO(dl_pagesize));
>> -  void *stack = __mmap (NULL, stack_size, prot,
>> -			MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
>> -  if (__glibc_unlikely (stack == MAP_FAILED))
>> -    return errno;
>> -
>> -  /* Disable asynchronous cancellation.  */
>> -  int state;
>> -  __libc_ptf_call (__pthread_setcancelstate,
>> -                   (PTHREAD_CANCEL_DISABLE, &state), 0);
>> -
>> -  /* Child must set args.err to something non-negative - we rely on
>> -     the parent and child sharing VM.  */
>> -  args.err = 0;
>> -  args.file = file;
>> -  args.exec = exec;
>> -  args.fa = file_actions;
>> -  args.attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 };
>> -  args.argv = argv;
>> -  args.argc = argc;
>> -  args.envp = envp;
>> -  args.xflags = xflags;
>> -
>> -  __libc_signal_block_all (&args.oldmask);
>> +  pid_t new_pid;
>>  
>>    /* The clone flags used will create a new child that will run in the same
>>       memory space (CLONE_VM) and the execution of calling thread will be
>> @@ -419,8 +329,8 @@ __spawnix (pid_t * pid, const char *file,
>>       need for CLONE_SETTLS.  Although parent and child share the same TLS
>>       namespace, there will be no concurrent access for TLS variables (errno
>>       for instance).  */
>> -  new_pid = CLONE (__spawni_child, STACK (stack, stack_size), stack_size,
>> -		   CLONE_VM | CLONE_VFORK | SIGCHLD, &args);
>> +  new_pid = CLONE (spawni_child, STACK (stack, stack_size), stack_size,
>> +		   CLONE_VM | CLONE_VFORK | SIGCHLD, args);
>>  
>>    /* It needs to collect the case where the auxiliary process was created
>>       but failed to execute the file (due either any preparation step or
>> @@ -433,7 +343,7 @@ __spawnix (pid_t * pid, const char *file,
>>  	 only in case of failure, so in case of premature termination
>>  	 due a signal args.err will remain zeroed and it will be up to
>>  	 caller to actually collect it.  */
>> -      ec = args.err;
>> +      ec = args->err;
>>        if (ec > 0)
>>  	/* There still an unlikely case where the child is cancelled after
>>  	   setting args.err, due to a positive error value.  Also there is
>> @@ -446,14 +356,139 @@ __spawnix (pid_t * pid, const char *file,
>>    else
>>      ec = -new_pid;
>>  
>> -  __munmap (stack, stack_size);
>> -
>>    if ((ec == 0) && (pid != NULL))
>>      *pid = new_pid;
>>  
>> -  __libc_signal_restore_set (&args.oldmask);
>> +  return ec;
>> +}
>>  
>> -  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
>> +#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
>> +/* This is compatibility function required to enable posix_spawn run
>> +   script without shebang definition for older posix_spawn versions
>> +   (2.15).  */
>> +static int
>> +execve_compat (const char *filename, char *const argv[], char *const envp[])
>> +{
>> +  __execve (filename, argv, envp);
>> +
>> +  if (errno == ENOEXEC)
>> +    {
>> +      char *const *cargv = argv;
>> +      ptrdiff_t argc = 0;
>> +      while (cargv[argc++] != NULL);
>> +
>> +      /* Construct an argument list for the shell.  */
>> +      char *new_argv[argc + 2];
>> +      new_argv[0] = (char *) _PATH_BSHELL;
>> +      new_argv[1] = (char *) filename;
>> +      if (argc > 1)
>> +	memcpy (new_argv + 2, argv + 1, argc * sizeof (char *));
>> +      else
>> +	new_argv[2] = NULL;
>> +
>> +      /* Execute the shell.  */
>> +      __execve (new_argv[0], new_argv, envp);
>> +    }
>> +
>> +  return -1;
>> +}
>> +
>> +/* Allocates a stack using mmap to call clone.  The stack size is based on
>> +   number of arguments since it would be used on compat mode which may call
>> +   execvpe/execve_compat.  */
>> +static int
>> +spawnix_compat (struct posix_spawn_args *args, pid_t *pid)
>> +{
>> +  char *const *argv = args->argv;
>> +
>> +  /* To avoid imposing hard limits on posix_spawn{p} the total number of
>> +     arguments is first calculated to allocate a mmap to hold all possible
>> +     values.  */
>> +  ptrdiff_t argc = 0;
>> +  /* Linux allows at most max (0x7FFFFFFF, 1/4 stack size) arguments
>> +     to be used in a execve call.  We limit to INT_MAX minus one due the
>> +     compatiblity code that may execute a shell script (maybe_script_execute)
>> +     where it will construct another argument list with an additional
>> +     argument.  */
>> +  ptrdiff_t limit = INT_MAX - 1;
>> +  while (argv[argc++] != NULL)
>> +    if (argc == limit)
>> +      {
>> +	errno = E2BIG;
>> +	return errno;
>> +      }
>> +
>> +  size_t argv_size = (argc * sizeof (void *));
>> +  /* We need at least a few pages in case the compiler's stack checking is
>> +     enabled.  In some configs, it is known to use at least 24KiB.  We use
>> +     32KiB to be "safe" from anything the compiler might do.  Besides, the
>> +     extra pages won't actually be allocated unless they get used.
>> +     It also acts the slack for spawn_closefrom (including MIPS64 getdents64
>> +     where it might use about 1k extra stack space.  */
>> +  argv_size += (32 * 1024);
>> +
>> +  /* Allocate a stack with an extra guard page.  */
>> +  size_t guard_size = stack_guard_size ();
>> +  size_t stack_size = guard_size + ALIGN_UP (argv_size, __getpagesize ());
>> +  void *stack = stack_allocate (stack_size, guard_size);
>> +  if (__glibc_unlikely (stack == MAP_FAILED))
>> +    return errno;
>> +
>> +  int ec = spawni_clone (args, stack, stack_size, pid);
>> +
>> +  __munmap (stack, stack_size);
>> +
>> +  return ec;
>> +}
>> +#endif
>> +
>> +/* For SPAWN_XFLAGS_TRY_SHELL we need to execute a script even without
>> +   a shebang.  To accomplish it we pass as callback to spawni_child
>> +   __execvpe (which call maybe_script_execute for such case) or
>> +   execve_compat (which mimics the semantic using execve).  */
>> +static int
>> +spawn_process (struct posix_spawn_args *args, pid_t *pid)
>> +{
>> +  int ec;
>> +
>> +#if SHLIB_COMPAT (libc, GLIBC_2_2, GLIBC_2_15)
>> +  if (args->xflags & SPAWN_XFLAGS_TRY_SHELL)
>> +    {
>> +      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
>> +		   ? __execvpe  : execve_compat;
>> +      ec = spawnix_compat (args, pid);
>> +    }
>> +  else
>> +#endif
>> +    {
>> +      args->exec = args->xflags & SPAWN_XFLAGS_USE_PATH
>> +		   ? __execvpex : __execve;
>> +
>> +      /* spawni_clone stack usage need to take in consideration spawni_child
>> +	 stack usage and subsequent functions called:
>> +
>> +	 - sigprocmask: might allocate an extra sigset_t (128 bytes).
>> +	 - __libc_sigaction: allocate a struct kernel_sigaction (144 bytes on
>> +	   64-bit, 136 on 32-bit).
>> +	 - __sched_setparam, __sched_setscheduler, __setsig, __setpgid,
>> +	   local_seteuid, local_setegid, __close_nocancel, __getrlimit64,
>> +	   __close_nocancel, __open_nocancel, __dup2, __chdir, __fchdir:
>> +	   and direct syscall.
>> +	 - __fcntl: wrapper only uses local variables.
>> +	 - spawn_closefrom: uses up to 1024 bytes as local buffer
>> +	   - __direntries_read
>> +	     - __getdents64: MIPS64 uses up to buffer size used, 1024 in this
>> +	       specific usage.
>> +	   - __direntries_next: local variables.
>> +	   - __close_nocancel: direct syscall.
>> +         - execvpe allocates at least (NAME_MAX + 1) + PATH_MAX to create the
>> +	   combination of PATH entry and program name (1024 + 255 + 1).
>> +
>> +	 It allocates 2048 plus some stack for automatic variables and function
>> +	 calls.  */
>> +      char stack[2560];
>> +      ec = spawni_clone (args, stack, sizeof stack, pid);
>> +    }
>>  
>>    return ec;
>>  }
>> @@ -462,12 +497,34 @@ __spawnix (pid_t * pid, const char *file,
>>     Before running the process perform the actions described in FILE-ACTIONS. */
>>  int
>>  __spawni (pid_t * pid, const char *file,
>> -	  const posix_spawn_file_actions_t * acts,
>> +	  const posix_spawn_file_actions_t * file_actions,
>>  	  const posix_spawnattr_t * attrp, char *const argv[],
>>  	  char *const envp[], int xflags)
>>  {
>> -  /* It uses __execvpex to avoid run ENOEXEC in non compatibility mode (it
>> -     will be handled by maybe_script_execute).  */
>> -  return __spawnix (pid, file, acts, attrp, argv, envp, xflags,
>> -		    xflags & SPAWN_XFLAGS_USE_PATH ? __execvpex :__execve);
>> +  /* Child must set args.err to something non-negative - we rely on
>> +     the parent and child sharing VM.  */
>> +  struct posix_spawn_args args = {
>> +    .err = 0,
>> +    .file = file,
>> +    .fa = file_actions,
>> +    .attr = attrp ? attrp : &(const posix_spawnattr_t) { 0 },
>> +    .argv = argv,
>> +    .envp = envp,
>> +    .xflags = xflags
>> +  };
>> +
>> +  /* Disable asynchronous cancellation.  */
>> +  int state;
>> +  __libc_ptf_call (__pthread_setcancelstate,
>> +                   (PTHREAD_CANCEL_DISABLE, &state), 0);
>> +
>> +  __libc_signal_block_all (&args.oldmask);
>> +
>> +  int ec = spawn_process (&args, pid);
>> +
>> +  __libc_signal_restore_set (&args.oldmask);
>> +
>> +  __libc_ptf_call (__pthread_setcancelstate, (state, NULL), 0);
>> +
>> +  return ec;
>>  }
>>

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-09-02 19:47             ` Adhemerval Zanella
@ 2019-10-07 17:51               ` Adhemerval Zanella
  2019-10-07 18:25                 ` Christian Brauner
  2019-10-07 18:41                 ` Florian Weimer
  0 siblings, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 17:51 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha

Florian, do you still hold objection to this patch?

On 02/09/2019 16:47, Adhemerval Zanella wrote:
> 
> 
> On 02/09/2019 10:14, Florian Weimer wrote:
>> * Adhemerval Zanella:
>>
>>> The problem is in fact false negatives, where posix_spawn will get a mask 
>>> *without* the bit set, but with a set signal disposition.
>>
>> Hmm.  Right.  Incidentally, the Go routine should be fine with that:
>>
>> | // When using cgo, call the C library for sigaction, so that we call into
>> | // any sanitizer interceptors. This supports using the memory
>> | // sanitizer with Go programs. The memory sanitizer only applies to
>> | // C/C++ code; this permits that code to see the Go runtime's existing signal
>> | // handlers when registering new signal handlers for the process.
>> | 
>> | //go:cgo_import_static x_cgo_sigaction
>> | //go:linkname x_cgo_sigaction x_cgo_sigaction
>> | //go:linkname _cgo_sigaction _cgo_sigaction
>> | var x_cgo_sigaction byte
>> | var _cgo_sigaction = &x_cgo_sigaction
>>
>> libjsig also keeps calling to glibc.
>>
>> Is there anything else we should check?
> 
> No idea, my take on that is once you start to calling syscall directly
> where libbc provide a wrapper you are in your own. We had a similar
> discussing with clone usage by some container applications and their
> expectation regarding libc internal state afterwards. 
> 
>>
>>> In fact I think due the syscall, even relaxed operations would work
>>> (since the syscall acts a strong memory barrier).
>>
>> Only as a signal fence, not a thread fence.  Some architectures can even
>> keep cache inconsistency across fork system calls.
>>
>> I find it a bit counter-intuitive that calling sigaction or signal
>> directly without the glibc wrappers could lead to data corruption, even
>> when done for standard signals such as SIGINT.  But that's what's going
>> to happen with this change, unfortunately.
> 
> What is counter-intuitive imho is to rely on libc to keep its internal
> consistency by bypassing it. This might be even worse if glibc start to
> wrapper the signal handler as a way to implement BZ#19702, for instance.
> 
> One thing we may do it to make it clean on manual that an application is
> *not* expect to call sigaction using syscall().
> 
>>
>>>>>> I wonder if we can get kernel support for this in the new clone system
>>>>>> call with more flags.  Then we don't have to complicate the sigaction
>>>>>> implementation.
>>>>>
>>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>>>>> afaik clone now is out of space on 'flags' for newer ones (it already
>>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>>>>> exit) and it would take time to use this feature on glibc.
>>>>
>>>> Christian Brauner has been working on fixing this.
>>>
>>> Which strategy he is proposing? Even with proper kernel support, it would
>>> take time to enable glibc to use it.
>>
>> Lots of flag arguments, with the reset of the arguments located
>> indirectly via a pointer argument.
>>
>> For a pure optimization, I think it's not too bad to require kernel
>> backports of system calls.
>>
>> Thanks,
>> Florian
>>

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-08-28 14:10   ` Adhemerval Zanella
@ 2019-10-07 17:51     ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 17:51 UTC (permalink / raw)
  To: libc-alpha

Ping (x2).

On 28/08/2019 11:09, Adhemerval Zanella wrote:
> Ping.
> 
> On 31/07/2019 15:31, Adhemerval Zanella wrote:
>> Change from previous version:
>>
>>   - Use libsupport and remove atfork usage on posix/wordexp-test.c.
>>
>> --
>>
>> This patch replaces the fork+exec by posix_spawn on wordexp, which
>> allows a better scability on Linux and simplifies the thread
>> cancellation handling.
>>
>> The only change which can not be implemented with posix_spawn the
>> /dev/null check to certify it is indeed the expected device.  I am
>> not sure how effetive this check is since /dev/null tampering means
>> something very wrong with the system and this is the least of the
>> issues.  My view is the tests is really out of the place and the
>> hardening provided is minimum.
>>
>> If the idea is still to provide such check, I think a possibilty
>> would be to open /dev/null, check it, add a dup2 file action, and
>> close the file descriptor.
>>
>> Checked on powerpc64le-linux-gnu and x86_64-linux-gnu.
>>
>> 	* include/spawn.h (__posix_spawn_file_actions_addopen): New
>> 	prototype.
>> 	* posix/spawn_faction_addopen.c (posix_spawn_file_actions_addopen):
>> 	Add internal alias.
>> 	* posix/wordexp.c (create_environment, free_environment): New
>> 	functions.
>> 	(exec_comm_child, exec_comm): Use posix_spawn instead of fork+exec.
>> 	* posix/wordexp-test.c: Use libsupport and remove atfork usage.
>> ---
>>  include/spawn.h               |   3 +
>>  posix/spawn_faction_addopen.c |   8 +-
>>  posix/wordexp-test.c          | 142 +++++++++--------------------
>>  posix/wordexp.c               | 167 ++++++++++++++++------------------
>>  4 files changed, 129 insertions(+), 191 deletions(-)
>>
>> diff --git a/include/spawn.h b/include/spawn.h
>> index 7fdd965bd7..4a0b1849da 100644
>> --- a/include/spawn.h
>> +++ b/include/spawn.h
>> @@ -11,6 +11,9 @@ __typeof (posix_spawn_file_actions_addclose)
>>  __typeof (posix_spawn_file_actions_adddup2)
>>    __posix_spawn_file_actions_adddup2 attribute_hidden;
>>  
>> +__typeof (posix_spawn_file_actions_addopen)
>> +  __posix_spawn_file_actions_addopen attribute_hidden;
>> +
>>  __typeof (posix_spawn_file_actions_destroy)
>>    __posix_spawn_file_actions_destroy attribute_hidden;
>>  
>> diff --git a/posix/spawn_faction_addopen.c b/posix/spawn_faction_addopen.c
>> index 742eb9526d..2e598de300 100644
>> --- a/posix/spawn_faction_addopen.c
>> +++ b/posix/spawn_faction_addopen.c
>> @@ -25,9 +25,9 @@
>>  /* Add an action to FILE-ACTIONS which tells the implementation to call
>>     `open' for the given file during the `spawn' call.  */
>>  int
>> -posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
>> -				  int fd, const char *path, int oflag,
>> -				  mode_t mode)
>> +__posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
>> +				    int fd, const char *path, int oflag,
>> +				    mode_t mode)
>>  {
>>    struct __spawn_action *rec;
>>  
>> @@ -60,3 +60,5 @@ posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
>>  
>>    return 0;
>>  }
>> +weak_alias (__posix_spawn_file_actions_addopen,
>> +	    posix_spawn_file_actions_addopen)
>> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
>> index 10a0768a6b..ef780b0a65 100644
>> --- a/posix/wordexp-test.c
>> +++ b/posix/wordexp-test.c
>> @@ -15,39 +15,21 @@
>>     License along with the GNU C Library; if not, see
>>     <http://www.gnu.org/licenses/>.  */
>>  
>> -#include <sys/stat.h>
>> -#include <sys/types.h>
>> -#include <sys/mman.h>
>> +#include <wordexp.h>
>> +#include <stdio.h>
>>  #include <fcntl.h>
>> -#include <unistd.h>
>>  #include <pwd.h>
>> -#include <stdio.h>
>> -#include <stdint.h>
>>  #include <stdlib.h>
>>  #include <string.h>
>> -#include <wordexp.h>
>> +#include <sys/mman.h>
>> +
>>  #include <libc-pointer-arith.h>
>> -#include <dso_handle.h>
>> +#include <array_length.h>
>> +#include <support/xunistd.h>
>> +#include <support/check.h>
>>  
>>  #define IFS " \n\t"
>>  
>> -extern int __register_atfork (void (*) (void), void (*) (void), void (*) (void), void *);
>> -
>> -static int __app_register_atfork (void (*prepare) (void), void (*parent) (void), void (*child) (void))
>> -{
>> -  return __register_atfork (prepare, parent, child, __dso_handle);
>> -}
>> -
>> -/* Number of forks seen.  */
>> -static int registered_forks;
>> -
>> -/* For each fork increment the fork count.  */
>> -static void
>> -register_fork (void)
>> -{
>> -  registered_forks++;
>> -}
>> -
>>  struct test_case_struct
>>  {
>>    int retval;
>> @@ -57,7 +39,7 @@ struct test_case_struct
>>    size_t wordc;
>>    const char *wordv[10];
>>    const char *ifs;
>> -} test_case[] =
>> +} static test_case[] =
>>    {
>>      /* Simple word- and field-splitting */
>>      { 0, NULL, "one", 0, 1, { "one", }, IFS },
>> @@ -238,8 +220,6 @@ struct test_case_struct
>>      { WRDE_SYNTAX, NULL, "${", 0, 0, { NULL, }, IFS },      /* BZ 18043  */
>>      { WRDE_SYNTAX, NULL, "L${a:", 0, 0, { NULL, }, IFS },   /* BZ 18043#c4  */
>>      { WRDE_SYNTAX, NULL, "$[1/0]", WRDE_NOCMD, 0, {NULL, }, IFS }, /* BZ 18100 */
>> -
>> -    { -1, NULL, NULL, 0, 0, { NULL, }, IFS },
>>    };
>>  
>>  static int testit (struct test_case_struct *tc);
>> @@ -256,16 +236,14 @@ command_line_test (const char *words)
>>      printf ("we_wordv[%d] = \"%s\"\n", i, we.we_wordv[i]);
>>  }
>>  
>> -int
>> -main (int argc, char *argv[])
>> +static int
>> +do_test (int argc, char *argv[])
>>  {
>> -  const char *globfile[] = { "one", "two", "three", NULL };
>> +  const char *globfile[] = { "one", "two", "three" };
>>    char tmpdir[32];
>>    struct passwd *pw;
>>    const char *cwd;
>>    int test;
>> -  int fail = 0;
>> -  int i;
>>    struct test_case_struct ts;
>>  
>>    if (argc > 1)
>> @@ -278,30 +256,18 @@ main (int argc, char *argv[])
>>  
>>    /* Set up arena for pathname expansion */
>>    tmpnam (tmpdir);
>> -  if (mkdir (tmpdir, S_IRWXU) || chdir (tmpdir))
>> -    return -1;
>> -  else
>> -    {
>> -      int fd;
>> +  xmkdir (tmpdir, S_IRWXU);
>> +  TEST_VERIFY_EXIT (chdir (tmpdir) == 0);
>>  
>> -      for (i = 0; globfile[i]; ++i)
>> -	if ((fd = creat (globfile[i], S_IRUSR | S_IWUSR)) == -1
>> -	    || close (fd))
>> -	  return -1;
>> -    }
>> -
>> -  /* If we are not allowed to do command substitution, we install
>> -     fork handlers to verify that no forks happened.  No forks should
>> -     happen at all if command substitution is disabled.  */
>> -  if (__app_register_atfork (register_fork, NULL, NULL) != 0)
>> +  for (int i = 0; i < array_length (globfile); ++i)
>>      {
>> -      printf ("Failed to register fork handler.\n");
>> -      return -1;
>> +      int fd = xopen (globfile[i], O_WRONLY|O_CREAT|O_TRUNC,
>> +		      S_IRUSR | S_IWUSR);
>> +      xclose (fd);
>>      }
>>  
>> -  for (test = 0; test_case[test].retval != -1; test++)
>> -    if (testit (&test_case[test]))
>> -      ++fail;
>> +  for (test = 0; test < array_length (test_case); test++)
>> +    TEST_COMPARE (testit (&test_case[test]), 0);
>>  
>>    /* Tilde-expansion tests. */
>>    pw = getpwnam ("root");
>> @@ -315,8 +281,7 @@ main (int argc, char *argv[])
>>        ts.wordv[0] = pw->pw_dir;
>>        ts.ifs = IFS;
>>  
>> -      if (testit (&ts))
>> -	++fail;
>> +      TEST_COMPARE (testit (&ts), 0);
>>  
>>        ts.retval = 0;
>>        ts.env = pw->pw_dir;
>> @@ -326,8 +291,7 @@ main (int argc, char *argv[])
>>        ts.wordv[0] = "x";
>>        ts.ifs = IFS;
>>  
>> -      if (testit (&ts))
>> -	++fail;
>> +      TEST_COMPARE (testit (&ts), 0);
>>      }
>>  
>>    /* "~" expands to value of $HOME when HOME is set */
>> @@ -342,8 +306,7 @@ main (int argc, char *argv[])
>>    ts.wordv[1] = "/dummy/home/foo";
>>    ts.ifs = IFS;
>>  
>> -  if (testit (&ts))
>> -    ++fail;
>> +  TEST_COMPARE (testit (&ts), 0);
>>  
>>    /* "~" expands to home dir from passwd file if HOME is not set */
>>  
>> @@ -359,8 +322,7 @@ main (int argc, char *argv[])
>>        ts.wordv[0] = pw->pw_dir;
>>        ts.ifs = IFS;
>>  
>> -      if (testit (&ts))
>> -	++fail;
>> +      TEST_COMPARE (testit (&ts), 0);
>>      }
>>  
>>    /* Integer overflow in division.  */
>> @@ -375,37 +337,32 @@ main (int argc, char *argv[])
>>        "18446744073709551616",
>>        "170141183460469231731687303715884105728",
>>        "340282366920938463463374607431768211456",
>> -      NULL
>>      };
>>  
>> -    for (const char *const *num = numbers; *num; ++num)
>> +    for (int i = 0; i < array_length (numbers); i++)
>>        {
>>  	wordexp_t p;
>>  	char pattern[256];
>> -	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", *num);
>> +	snprintf (pattern, sizeof (pattern), "$[(-%s)/(-1)]", numbers[i]);
>>  	int ret = wordexp (pattern, &p, WRDE_NOCMD);
>>  	if (ret == 0)
>>  	  {
>> -	    if (p.we_wordc != 1 || strcmp (p.we_wordv[0], *num) != 0)
>> -	      {
>> -		printf ("Integer overflow for \"%s\" failed", pattern);
>> -		++fail;
>> -	      }
>> +	    TEST_COMPARE (p.we_wordc, 1);
>> +	    TEST_COMPARE (strcmp (p.we_wordv[0], numbers[i]), 0);
>>  	    wordfree (&p);
>>  	  }
>> -	else if (ret != WRDE_SYNTAX)
>> +	else
>>  	  {
>> -	    printf ("Integer overflow for \"%s\" failed with %d",
>> -		    pattern, ret);
>> -	    ++fail;
>> +	    TEST_COMPARE (ret, WRDE_SYNTAX);
>> +	    if (ret != WRDE_SYNTAX)
>> +	      printf ("Integer overflow for \"%s\" failed with %d",
>> +		      pattern, ret);
>>  	  }
>>        }
>>    }
>>  
>> -  puts ("tests completed, now cleaning up");
>> -
>>    /* Clean up */
>> -  for (i = 0; globfile[i]; ++i)
>> +  for (int i = 0; i < array_length (globfile); ++i)
>>      remove (globfile[i]);
>>  
>>    if (cwd == NULL)
>> @@ -414,26 +371,17 @@ main (int argc, char *argv[])
>>    chdir (cwd);
>>    rmdir (tmpdir);
>>  
>> -  printf ("tests failed: %d\n", fail);
>> -
>> -  return fail != 0;
>> +  return 0;
>>  }
>>  
>>  static const char *
>>  at_page_end (const char *words)
>>  {
>>    const int pagesize = getpagesize ();
>> -  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
>> -		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>> +  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
>> +		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
>>  
>> -  if (start == MAP_FAILED)
>> -    return start;
>> -
>> -  if (mprotect (start + pagesize, pagesize, PROT_NONE))
>> -    {
>> -      munmap (start, 2 * pagesize);
>> -      return MAP_FAILED;
>> -    }
>> +  xmprotect (start + pagesize, pagesize, PROT_NONE);
>>  
>>    /* Includes terminating NUL.  */
>>    const size_t words_size = strlen (words) + 1;
>> @@ -472,9 +420,6 @@ testit (struct test_case_struct *tc)
>>    fflush (NULL);
>>    const char *words = at_page_end (tc->words);
>>  
>> -  if (tc->flags & WRDE_NOCMD)
>> -    registered_forks = 0;
>> -
>>    if (tc->flags & WRDE_APPEND)
>>      {
>>        /* initial wordexp() call, to be appended to */
>> @@ -486,13 +431,6 @@ testit (struct test_case_struct *tc)
>>      }
>>    retval = wordexp (words, &we, tc->flags);
>>  
>> -  if ((tc->flags & WRDE_NOCMD)
>> -      && (registered_forks > 0))
>> -    {
>> -	  printf ("FAILED fork called for WRDE_NOCMD\n");
>> -	  return 1;
>> -    }
>> -
>>    if (tc->flags & WRDE_DOOFFS)
>>        start_offs = sav_we.we_offs;
>>  
>> @@ -551,9 +489,11 @@ testit (struct test_case_struct *tc)
>>    const int page_size = getpagesize ();
>>    char *start = (char *) PTR_ALIGN_DOWN (words, page_size);
>>  
>> -  if (munmap (start, 2 * page_size) != 0)
>> -    return 1;
>> +  xmunmap (start, 2 * page_size);
>>  
>>    fflush (NULL);
>>    return bzzzt;
>>  }
>> +
>> +#define TEST_FUNCTION_ARGV do_test
>> +#include <support/test-driver.c>
>> diff --git a/posix/wordexp.c b/posix/wordexp.c
>> index 22c6d18a9c..e1aafcaceb 100644
>> --- a/posix/wordexp.c
>> +++ b/posix/wordexp.c
>> @@ -25,33 +25,18 @@
>>  #include <libintl.h>
>>  #include <paths.h>
>>  #include <pwd.h>
>> -#include <signal.h>
>>  #include <stdbool.h>
>>  #include <stdio.h>
>> -#include <stdlib.h>
>>  #include <string.h>
>>  #include <sys/param.h>
>> -#include <sys/stat.h>
>> -#include <sys/time.h>
>> -#include <sys/types.h>
>> -#include <sys/types.h>
>>  #include <sys/wait.h>
>>  #include <unistd.h>
>> -#include <wchar.h>
>>  #include <wordexp.h>
>> -#include <kernel-features.h>
>> +#include <spawn.h>
>>  #include <scratch_buffer.h>
>> -
>> -#include <libc-lock.h>
>>  #include <_itoa.h>
>> -
>> -/* Undefine the following line for the production version.  */
>> -/* #define NDEBUG 1 */
>>  #include <assert.h>
>>  
>> -/* Get some device information.  */
>> -#include <device-nrs.h>
>> -
>>  /*
>>   * This is a recursive-descent-style word expansion routine.
>>   */
>> @@ -812,61 +797,90 @@ parse_arith (char **word, size_t *word_length, size_t *max_length,
>>    return WRDE_SYNTAX;
>>  }
>>  
>> +static char **
>> +create_environment (void)
>> +{
>> +  size_t s = 0;
>> +
>> +  /* Calculate total environment size, including 'IFS' if is present.  */
>> +  for (char **ep = __environ; *ep != NULL; ep++, s++);
>> +
>> +  /* Include final NULL pointer.  */
>> +  char **newenviron = malloc (s * sizeof (char*));
>> +  if (newenviron == NULL)
>> +    return NULL;
>> +
>> +  /* Copy current environment excluding 'IFS', to make sure the subshell
>> +     doesn't field-split on our behalf. */
>> +  size_t i, j;
>> +  for (i = 0, j = 0; i < s; i++)
>> +    if (strncmp (__environ[i], "IFS=", sizeof ("IFS=")-1) != 0)
>> +      newenviron[j++] = __strdup (__environ[i]);
>> +  newenviron[j] = NULL;
>> +
>> +  return newenviron;
>> +}
>> +
>> +static void
>> +free_environment (char **environ)
>> +{
>> +  for (char **ep = environ; *ep != NULL; ep++)
>> +    free (*ep);
>> +  free (environ);
>> +}
>> +
>>  /* Function called by child process in exec_comm() */
>> -static inline void
>> -__attribute__ ((always_inline))
>> -exec_comm_child (char *comm, int *fildes, int showerr, int noexec)
>> +static pid_t
>> +exec_comm_child (char *comm, int *fildes, bool showerr, bool noexec)
>>  {
>> -  const char *args[4] = { _PATH_BSHELL, "-c", comm, NULL };
>> +  pid_t pid = -1;
>>  
>> -  /* Execute the command, or just check syntax? */
>> -  if (noexec)
>> -    args[1] = "-nc";
>> +  /* Execute the command, or just check syntax?  */
>> +  const char *args[] = { _PATH_BSHELL, noexec ? "-nc" : "-c", comm, NULL };
>>  
>> -  /* Redirect output.  */
>> -  if (__glibc_likely (fildes[1] != STDOUT_FILENO))
>> -    {
>> -      __dup2 (fildes[1], STDOUT_FILENO);
>> -      __close (fildes[1]);
>> -    }
>> -  else
>> -    /* Reset the close-on-exec flag (if necessary).  */
>> -    __fcntl (fildes[1], F_SETFD, 0);
>> +  posix_spawn_file_actions_t fa;
>> +  /* posix_spawn_file_actions_init does not fail.  */
>> +  __posix_spawn_file_actions_init (&fa);
>>  
>> -  /* Redirect stderr to /dev/null if we have to.  */
>> -  if (showerr == 0)
>> +  /* Redirect output.  For check syntax only (noexec being true), exec_comm
>> +     explicits sets fildes[1] to -1, so check its value to avoid a failure in
>> +     __posix_spawn_file_actions_adddup2.  */
>> +  if (fildes[1] != -1)
>>      {
>> -      struct stat64 st;
>> -      int fd;
>> -      __close (STDERR_FILENO);
>> -      fd = __open (_PATH_DEVNULL, O_WRONLY);
>> -      if (fd >= 0 && fd != STDERR_FILENO)
>> +      if (__glibc_likely (fildes[1] != STDOUT_FILENO))
>>  	{
>> -	  __dup2 (fd, STDERR_FILENO);
>> -	  __close (fd);
>> +	  if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1],
>> +						  STDOUT_FILENO) != 0
>> +	      || __posix_spawn_file_actions_addclose (&fa, fildes[1]) != 0)
>> +	    goto out;
>>  	}
>> -      /* Be paranoid.  Check that we actually opened the /dev/null
>> -	 device.  */
>> -      if (__builtin_expect (__fxstat64 (_STAT_VER, STDERR_FILENO, &st), 0) != 0
>> -	  || __builtin_expect (S_ISCHR (st.st_mode), 1) == 0
>> -#if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
>> -	  || st.st_rdev != __gnu_dev_makedev (DEV_NULL_MAJOR, DEV_NULL_MINOR)
>> -#endif
>> -	  )
>> -	/* It's not the /dev/null device.  Stop right here.  The
>> -	   problem is: how do we stop?  We use _exit() with an
>> -	   hopefully unusual exit code.  */
>> -	_exit (90);
>> +      else
>> +	/* Reset the close-on-exec flag (if necessary).  */
>> +	if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1], fildes[1])
>> +	    != 0)
>> +	  goto out;
>>      }
>>  
>> -  /* Make sure the subshell doesn't field-split on our behalf. */
>> -  __unsetenv ("IFS");
>> +  /* Redirect stderr to /dev/null if we have to.  */
>> +  if (!showerr)
>> +    if (__posix_spawn_file_actions_addopen (&fa, STDERR_FILENO, _PATH_DEVNULL,
>> +					    O_WRONLY, 0) != 0)
>> +      goto out;
>> +
>> +  char **newenv = create_environment ();
>> +  if (newenv == NULL)
>> +    goto out;
>>  
>> -  __close (fildes[0]);
>> -  __execve (_PATH_BSHELL, (char *const *) args, __environ);
>> +  /* pid is unset if posix_spawn fails, so it keep the original value
>> +     of -1.  */
>> +  __posix_spawn (&pid, _PATH_BSHELL, &fa, NULL, (char *const *) args, newenv);
>>  
>> -  /* Bad.  What now?  */
>> -  abort ();
>> +  free_environment (newenv);
>> +
>> +out:
>> +  __posix_spawn_file_actions_destroy (&fa);
>> +
>> +  return pid;
>>  }
>>  
>>  /* Function to execute a command and retrieve the results */
>> @@ -884,13 +898,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>>    size_t maxnewlines = 0;
>>    char buffer[bufsize];
>>    pid_t pid;
>> -  int noexec = 0;
>> +  bool noexec = false;
>>  
>>    /* Do nothing if command substitution should not succeed.  */
>>    if (flags & WRDE_NOCMD)
>>      return WRDE_CMDSUB;
>>  
>> -  /* Don't fork() unless necessary */
>> +  /* Don't posix_spawn() unless necessary */
>>    if (!comm || !*comm)
>>      return 0;
>>  
>> @@ -898,19 +912,15 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>>      return WRDE_NOSPACE;
>>  
>>   again:
>> -  if ((pid = __fork ()) < 0)
>> +  pid = exec_comm_child (comm, fildes, noexec ? false : flags & WRDE_SHOWERR,
>> +			 noexec);
>> +  if (pid < 0)
>>      {
>> -      /* Bad */
>>        __close (fildes[0]);
>>        __close (fildes[1]);
>>        return WRDE_NOSPACE;
>>      }
>>  
>> -  if (pid == 0)
>> -    exec_comm_child (comm, fildes, noexec ? 0 : flags & WRDE_SHOWERR, noexec);
>> -
>> -  /* Parent */
>> -
>>    /* If we are just testing the syntax, only wait.  */
>>    if (noexec)
>>      return (TEMP_FAILURE_RETRY (__waitpid (pid, &status, 0)) == pid
>> @@ -1091,7 +1101,7 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>>    /* Check for syntax error (re-execute but with "-n" flag) */
>>    if (buflen < 1 && status != 0)
>>      {
>> -      noexec = 1;
>> +      noexec = true;
>>        goto again;
>>      }
>>  
>> @@ -1143,26 +1153,9 @@ parse_comm (char **word, size_t *word_length, size_t *max_length,
>>  	      /* Go -- give script to the shell */
>>  	      if (comm)
>>  		{
>> -#ifdef __libc_ptf_call
>> -		  /* We do not want the exec_comm call to be cut short
>> -		     by a thread cancellation since cleanup is very
>> -		     ugly.  Therefore disable cancellation for
>> -		     now.  */
>> -		  // XXX Ideally we do want the thread being cancelable.
>> -		  // XXX If demand is there we'll change it.
>> -		  int state = PTHREAD_CANCEL_ENABLE;
>> -		  __libc_ptf_call (__pthread_setcancelstate,
>> -				   (PTHREAD_CANCEL_DISABLE, &state), 0);
>> -#endif
>> -
>> +		  /* posix_spawn already handles thread cancellation.  */
>>  		  error = exec_comm (comm, word, word_length, max_length,
>>  				     flags, pwordexp, ifs, ifs_white);
>> -
>> -#ifdef __libc_ptf_call
>> -		  __libc_ptf_call (__pthread_setcancelstate,
>> -				   (state, NULL), 0);
>> -#endif
>> -
>>  		  free (comm);
>>  		}
>>  
>>

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 17:51               ` Adhemerval Zanella
@ 2019-10-07 18:25                 ` Christian Brauner
  2019-10-07 18:32                   ` Florian Weimer
  2019-10-07 18:35                   ` Adhemerval Zanella
  2019-10-07 18:41                 ` Florian Weimer
  1 sibling, 2 replies; 59+ messages in thread
From: Christian Brauner @ 2019-10-07 18:25 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Florian Weimer, libc-alpha

On Mon, Oct 07, 2019 at 02:50:56PM -0300, Adhemerval Zanella wrote:
> Florian, do you still hold objection to this patch?
> 
> On 02/09/2019 16:47, Adhemerval Zanella wrote:
> > 
> > 
> > On 02/09/2019 10:14, Florian Weimer wrote:
> >> * Adhemerval Zanella:
> >>
> >>> The problem is in fact false negatives, where posix_spawn will get a mask 
> >>> *without* the bit set, but with a set signal disposition.
> >>
> >> Hmm.  Right.  Incidentally, the Go routine should be fine with that:
> >>
> >> | // When using cgo, call the C library for sigaction, so that we call into
> >> | // any sanitizer interceptors. This supports using the memory
> >> | // sanitizer with Go programs. The memory sanitizer only applies to
> >> | // C/C++ code; this permits that code to see the Go runtime's existing signal
> >> | // handlers when registering new signal handlers for the process.
> >> | 
> >> | //go:cgo_import_static x_cgo_sigaction
> >> | //go:linkname x_cgo_sigaction x_cgo_sigaction
> >> | //go:linkname _cgo_sigaction _cgo_sigaction
> >> | var x_cgo_sigaction byte
> >> | var _cgo_sigaction = &x_cgo_sigaction
> >>
> >> libjsig also keeps calling to glibc.
> >>
> >> Is there anything else we should check?
> > 
> > No idea, my take on that is once you start to calling syscall directly
> > where libbc provide a wrapper you are in your own. We had a similar
> > discussing with clone usage by some container applications and their
> > expectation regarding libc internal state afterwards. 
> > 
> >>
> >>> In fact I think due the syscall, even relaxed operations would work
> >>> (since the syscall acts a strong memory barrier).
> >>
> >> Only as a signal fence, not a thread fence.  Some architectures can even
> >> keep cache inconsistency across fork system calls.
> >>
> >> I find it a bit counter-intuitive that calling sigaction or signal
> >> directly without the glibc wrappers could lead to data corruption, even
> >> when done for standard signals such as SIGINT.  But that's what's going
> >> to happen with this change, unfortunately.
> > 
> > What is counter-intuitive imho is to rely on libc to keep its internal
> > consistency by bypassing it. This might be even worse if glibc start to
> > wrapper the signal handler as a way to implement BZ#19702, for instance.
> > 
> > One thing we may do it to make it clean on manual that an application is
> > *not* expect to call sigaction using syscall().
> > 
> >>
> >>>>>> I wonder if we can get kernel support for this in the new clone system
> >>>>>> call with more flags.  Then we don't have to complicate the sigaction
> >>>>>> implementation.
> >>>>>
> >>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
> >>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
> >>>>> afaik clone now is out of space on 'flags' for newer ones (it already
> >>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
> >>>>> exit) and it would take time to use this feature on glibc.
> >>>>
> >>>> Christian Brauner has been working on fixing this.
> >>>
> >>> Which strategy he is proposing? Even with proper kernel support, it would
> >>> take time to enable glibc to use it.
> >>
> >> Lots of flag arguments, with the reset of the arguments located
> >> indirectly via a pointer argument.
> >>
> >> For a pure optimization, I think it's not too bad to require kernel
> >> backports of system calls.

So I just accidently caught wind of this discussion. :)
I'm open to extending clone3() to support something like the above.
My new clone3() version has been released with Linux 5.3. It takes a
struct clone_args. The structure is versioned by size and thus - in
theory - extensible indefinitely.

(I also sent a PR for v5.4-rc2 that got merged for the
copy_struct_from_user() work from Aleksa. It adds a common helper for
copying structure arguments version by size. This will guarantee that
future syscalls will all use the same size-versioning logic (Yes, we
need to be careful with unions.).)

[1]: fork: add clone3
     https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7f192e3cd316ba58c88dfa26796cf77789dd9872

[2]: lib: introduce copy_struct_from_user() helper
     https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f5a1a536fa14895ccff4e94e6a5af90901ce86aa

Christian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-09-02 17:38       ` Adhemerval Zanella
  2019-10-07 17:49         ` Adhemerval Zanella
@ 2019-10-07 18:29         ` Florian Weimer
  2019-10-08 17:38           ` Adhemerval Zanella
  1 sibling, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-07 18:29 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> +#define copy_field(dst, src)			\
> +  ({						\
> +     typeof (src) _src;				\
> +     memcpy (&_src, &(src), sizeof (src));	\
> +     typeof (dst) _dst = _src;			\
> +     memcpy (&(dst), &_dst, sizeof (dst));	\
> +  })
> +
> +      copy_field (dp->d_ino, kdp->d_ino);
> +      copy_field (dp->d_off, kdp->d_off);
> +      copy_field (last_offset, kdp->d_off);
> +      copy_field (dp->d_reclen, new_reclen);
>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);

I believe this still asserts the dynamic type of *dp, which is not what
we want.  The truly portable way probably involves using offsetof and
not -> dereferencing. 8-(

Considering that, I would probably drop copy_field, compile the file
with -fno-strict-aliasing, and add a comment to the (now plain
assignments) that this is okay due to -fno-strict-aliasing.

But this is really up to you, I do not want to discuss this patch to
death.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:25                 ` Christian Brauner
@ 2019-10-07 18:32                   ` Florian Weimer
  2019-10-07 21:08                     ` Christian Brauner
  2019-10-07 18:35                   ` Adhemerval Zanella
  1 sibling, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-07 18:32 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Adhemerval Zanella, libc-alpha

* Christian Brauner:

>> >>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>> >>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>> >>>>> afaik clone now is out of space on 'flags' for newer ones (it already
>> >>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>> >>>>> exit) and it would take time to use this feature on glibc.
>> >>>>
>> >>>> Christian Brauner has been working on fixing this.
>> >>>
>> >>> Which strategy he is proposing? Even with proper kernel support, it would
>> >>> take time to enable glibc to use it.
>> >>
>> >> Lots of flag arguments, with the reset of the arguments located
>> >> indirectly via a pointer argument.
>> >>
>> >> For a pure optimization, I think it's not too bad to require kernel
>> >> backports of system calls.
>
> So I just accidently caught wind of this discussion. :)

Good. 8-)

> I'm open to extending clone3() to support something like the above.
> My new clone3() version has been released with Linux 5.3. It takes a
> struct clone_args. The structure is versioned by size and thus - in
> theory - extensible indefinitely.

Christian, would you be able to implement the CLONE_RESET_SIGNALS flag
for us?  It should reset any handler which is not SIG_IGN or SIG_DFL to
SIG_DFL.  We'd also need a way to probe that the flag is supported, so
that we can fall back to the current way of doing things otherwise.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:25                 ` Christian Brauner
  2019-10-07 18:32                   ` Florian Weimer
@ 2019-10-07 18:35                   ` Adhemerval Zanella
  2019-10-07 18:40                     ` Florian Weimer
  2019-10-07 21:00                     ` Joseph Myers
  1 sibling, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 18:35 UTC (permalink / raw)
  To: Christian Brauner; +Cc: Florian Weimer, libc-alpha



On 07/10/2019 15:25, Christian Brauner wrote:
> On Mon, Oct 07, 2019 at 02:50:56PM -0300, Adhemerval Zanella wrote:
>> Florian, do you still hold objection to this patch?
>>
>> On 02/09/2019 16:47, Adhemerval Zanella wrote:
>>>
>>>
>>> On 02/09/2019 10:14, Florian Weimer wrote:
>>>> * Adhemerval Zanella:
>>>>
>>>>> The problem is in fact false negatives, where posix_spawn will get a mask 
>>>>> *without* the bit set, but with a set signal disposition.
>>>>
>>>> Hmm.  Right.  Incidentally, the Go routine should be fine with that:
>>>>
>>>> | // When using cgo, call the C library for sigaction, so that we call into
>>>> | // any sanitizer interceptors. This supports using the memory
>>>> | // sanitizer with Go programs. The memory sanitizer only applies to
>>>> | // C/C++ code; this permits that code to see the Go runtime's existing signal
>>>> | // handlers when registering new signal handlers for the process.
>>>> | 
>>>> | //go:cgo_import_static x_cgo_sigaction
>>>> | //go:linkname x_cgo_sigaction x_cgo_sigaction
>>>> | //go:linkname _cgo_sigaction _cgo_sigaction
>>>> | var x_cgo_sigaction byte
>>>> | var _cgo_sigaction = &x_cgo_sigaction
>>>>
>>>> libjsig also keeps calling to glibc.
>>>>
>>>> Is there anything else we should check?
>>>
>>> No idea, my take on that is once you start to calling syscall directly
>>> where libbc provide a wrapper you are in your own. We had a similar
>>> discussing with clone usage by some container applications and their
>>> expectation regarding libc internal state afterwards. 
>>>
>>>>
>>>>> In fact I think due the syscall, even relaxed operations would work
>>>>> (since the syscall acts a strong memory barrier).
>>>>
>>>> Only as a signal fence, not a thread fence.  Some architectures can even
>>>> keep cache inconsistency across fork system calls.
>>>>
>>>> I find it a bit counter-intuitive that calling sigaction or signal
>>>> directly without the glibc wrappers could lead to data corruption, even
>>>> when done for standard signals such as SIGINT.  But that's what's going
>>>> to happen with this change, unfortunately.
>>>
>>> What is counter-intuitive imho is to rely on libc to keep its internal
>>> consistency by bypassing it. This might be even worse if glibc start to
>>> wrapper the signal handler as a way to implement BZ#19702, for instance.
>>>
>>> One thing we may do it to make it clean on manual that an application is
>>> *not* expect to call sigaction using syscall().
>>>
>>>>
>>>>>>>> I wonder if we can get kernel support for this in the new clone system
>>>>>>>> call with more flags.  Then we don't have to complicate the sigaction
>>>>>>>> implementation.
>>>>>>>
>>>>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
>>>>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
>>>>>>> afaik clone now is out of space on 'flags' for newer ones (it already
>>>>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
>>>>>>> exit) and it would take time to use this feature on glibc.
>>>>>>
>>>>>> Christian Brauner has been working on fixing this.
>>>>>
>>>>> Which strategy he is proposing? Even with proper kernel support, it would
>>>>> take time to enable glibc to use it.
>>>>
>>>> Lots of flag arguments, with the reset of the arguments located
>>>> indirectly via a pointer argument.
>>>>
>>>> For a pure optimization, I think it's not too bad to require kernel
>>>> backports of system calls.
> 
> So I just accidently caught wind of this discussion. :)
> I'm open to extending clone3() to support something like the above.
> My new clone3() version has been released with Linux 5.3. It takes a
> struct clone_args. The structure is versioned by size and thus - in
> theory - extensible indefinitely.
> 
> (I also sent a PR for v5.4-rc2 that got merged for the
> copy_struct_from_user() work from Aleksa. It adds a common helper for
> copying structure arguments version by size. This will guarantee that
> future syscalls will all use the same size-versioning logic (Yes, we
> need to be careful with unions.).)
> 
> [1]: fork: add clone3
>      https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=7f192e3cd316ba58c88dfa26796cf77789dd9872
> 
> [2]: lib: introduce copy_struct_from_user() helper
>      https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f5a1a536fa14895ccff4e94e6a5af90901ce86aa
> 
> Christian
> 

Yeah, I am aware of it and ideally if my patch eventually gets merged
I will probably add a code path to use clone3 when possible (not sure
if it worth to be enable only for --enable-kernel or a main path with
ENOSYS fallback to clone).

However, glibc supports older kernels as old as v3.2 and it will take
some years and releases to make v5.3 or new the minimum support kernel.
And I think it would be nice to have this optimization even for older
kernels.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:35                   ` Adhemerval Zanella
@ 2019-10-07 18:40                     ` Florian Weimer
  2019-10-07 19:20                       ` Adhemerval Zanella
  2019-10-07 21:00                     ` Joseph Myers
  1 sibling, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-07 18:40 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Christian Brauner, libc-alpha

* Adhemerval Zanella:

> However, glibc supports older kernels as old as v3.2 and it will take
> some years and releases to make v5.3 or new the minimum support kernel.
> And I think it would be nice to have this optimization even for older
> kernels.

But wouldn't it make sense to backport clone3 to these older kernels, so
that further enhancements are possible, in cooperation with the kernel.

I'm all for supporting older kernels, but performance optimizations for
old kernels, merely to work around missing system call/flag support for
things which are straightforward to backport into the kernel seems not
the right priority to me, sorry.

If things can be fixed in the kernel, fix it there.  That applies to
both performance and functionality bugs.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 17:51               ` Adhemerval Zanella
  2019-10-07 18:25                 ` Christian Brauner
@ 2019-10-07 18:41                 ` Florian Weimer
  1 sibling, 0 replies; 59+ messages in thread
From: Florian Weimer @ 2019-10-07 18:41 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> Florian, do you still hold objection to this patch?

Yes, I still don't like it.  Sorry.

Is this really important to you?  Have seen this showing up in profiles?

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:40                     ` Florian Weimer
@ 2019-10-07 19:20                       ` Adhemerval Zanella
  2019-10-09  9:37                         ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-07 19:20 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Christian Brauner, libc-alpha



On 07/10/2019 15:40, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> However, glibc supports older kernels as old as v3.2 and it will take
>> some years and releases to make v5.3 or new the minimum support kernel.
>> And I think it would be nice to have this optimization even for older
>> kernels.
> 
> But wouldn't it make sense to backport clone3 to these older kernels, so
> that further enhancements are possible, in cooperation with the kernel.

For a kernel standpoint sure, for libc one it only make sense if it becomes
de-facto kernel ABI. It can be quite feasible from a distribution standpoint,
where it controls both kernel and userland deployment. But it is not the only
scenario glibc aims to work neither we should prioritize it.

> 
> I'm all for supporting older kernels, but performance optimizations for
> old kernels, merely to work around missing system call/flag support for
> things which are straightforward to backport into the kernel seems not
> the right priority to me, sorry.

Although the Linux idea is to use everything upstream, which means to have
the latest kernel up and running all the time a new is release; this is far
from reality and it won't change in nearby feature (although I do see some
steps towards it in various projects).

> 
> If things can be fixed in the kernel, fix it there.  That applies to
> both performance and functionality bugs.

I do agree with you, but this is mostly an engineering decision where it
aims to provide an optimization to a broader audience rather than to an
specific scenario.

> Is this really important to you?  Have seen this showing up in profiles?

Not right not, but mostly because projects usually do not use posix_spawn
as the way to spawn process. But over the years we not only fixed all its
various issues but also optimize in both memory and cpu usage and also added
some extra extensions that some projects had that prevent them to use
posix_spawn instead of the old fork plus execve.

So I expect that more and more posix_spawn should be the expected way to
spawn processes on Linux (some projects are indeed using it, such as
gnome3) and by avoiding ~120 syscall each time a process is spawned 
is a nice performance improvement.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-07-31 18:31 ` [PATCH v2 5/5] posix: Use posix_spawn for wordexp Adhemerval Zanella
  2019-08-28 14:10   ` Adhemerval Zanella
@ 2019-10-07 19:33   ` Florian Weimer
  2019-10-07 21:04     ` Carlos O'Donell
  2019-10-08 17:41     ` Adhemerval Zanella
  1 sibling, 2 replies; 59+ messages in thread
From: Florian Weimer @ 2019-10-07 19:33 UTC (permalink / raw)
  To: Adhemerval Zanella, Carlos O'Donell; +Cc: libc-alpha

* Adhemerval Zanella:

> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
> index 10a0768a6b..ef780b0a65 100644
> --- a/posix/wordexp-test.c
> +++ b/posix/wordexp-test.c

> -/* For each fork increment the fork count.  */
> -static void
> -register_fork (void)
> -{
> -  registered_forks++;
> -}

It's a bit sad to see this testing go away.  It was originally added to
catch command execution with WRDE_NOCMD.

On Linux, could you enter a PID namespace instead and check that the
next PID has the expected value?

Carlos, you added this testing.  Do you have an opinion here?

> diff --git a/posix/wordexp.c b/posix/wordexp.c
> index 22c6d18a9c..e1aafcaceb 100644
> --- a/posix/wordexp.c
> +++ b/posix/wordexp.c

> +static char **
> +create_environment (void)
> +{
> +  size_t s = 0;
> +
> +  /* Calculate total environment size, including 'IFS' if is present.  */
> +  for (char **ep = __environ; *ep != NULL; ep++, s++);

I would put s++ into the body of the for loop, for clarity.  Or give ep
a wider scope and use s = ep -- __environ.

> +  /* Include final NULL pointer.  */
> +  char **newenviron = malloc (s * sizeof (char*));
> +  if (newenviron == NULL)
> +    return NULL;

char* should be char *.  I don't see how this includes the final NULL?

Should we do all this work only if IFS= is actually present?  That is,
skip all this for getenv ("IFS) == NULL?

> +  /* Copy current environment excluding 'IFS', to make sure the subshell
> +     doesn't field-split on our behalf. */

That comment should apply to the entire function, I think.

> @@ -884,13 +898,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>    size_t maxnewlines = 0;
>    char buffer[bufsize];
>    pid_t pid;
> -  int noexec = 0;
> +  bool noexec = false;
>  
>    /* Do nothing if command substitution should not succeed.  */
>    if (flags & WRDE_NOCMD)
>      return WRDE_CMDSUB;
>  
> -  /* Don't fork() unless necessary */
> +  /* Don't posix_spawn() unless necessary */

GNU style doesn't use () after function names.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:35                   ` Adhemerval Zanella
  2019-10-07 18:40                     ` Florian Weimer
@ 2019-10-07 21:00                     ` Joseph Myers
  1 sibling, 0 replies; 59+ messages in thread
From: Joseph Myers @ 2019-10-07 21:00 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Christian Brauner, Florian Weimer, libc-alpha

On Mon, 7 Oct 2019, Adhemerval Zanella wrote:

> However, glibc supports older kernels as old as v3.2 and it will take
> some years and releases to make v5.3 or new the minimum support kernel.
> And I think it would be nice to have this optimization even for older
> kernels.

I should note there are two reasons I haven't proposed an increase from 
3.2 to a more recent minimum kernel version, given that 3.2 is no longer a 
maintained Linux kernel release series.

1. An increase to 3.16, the current oldest longterm kernel at 
<https://www.kernel.org/category/releases.html>, doesn't really allow much 
in the way of cleanups.  The next minimum kernel version that would allow 
many cleanups is 4.4 (most separate socket syscalls available on all 
socketcall architectures, so most socketcall support can be removed) - I 
think that's the next update that brings enough benefits to be worthwhile.  
(And after that, the next such update would be to require 64-bit time 
syscalls on 32-bit architectures once all older kernel series stop being 
maintained - in 2024 based on the dates on that page.)

2. I was hoping that before we next increase the minimum kernel version we 
can have the changes Carlos was proposing to stop giving an error at 
startup for a too-old kernel, instead allowing code to run with syscalls 
possibly failing, in order to work better in the case of (new userspace in 
a container running under) older kernel version numbers with features 
possibly backported or no relevant new features actually required by 
glibc.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-10-07 19:33   ` Florian Weimer
@ 2019-10-07 21:04     ` Carlos O'Donell
  2019-10-08  9:58       ` Florian Weimer
  2019-10-08 17:41     ` Adhemerval Zanella
  1 sibling, 1 reply; 59+ messages in thread
From: Carlos O'Donell @ 2019-10-07 21:04 UTC (permalink / raw)
  To: Florian Weimer, Adhemerval Zanella; +Cc: libc-alpha

On 10/7/19 3:33 PM, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
>> index 10a0768a6b..ef780b0a65 100644
>> --- a/posix/wordexp-test.c
>> +++ b/posix/wordexp-test.c
> 
>> -/* For each fork increment the fork count.  */
>> -static void
>> -register_fork (void)
>> -{
>> -  registered_forks++;
>> -}
> 
> It's a bit sad to see this testing go away.  It was originally added to
> catch command execution with WRDE_NOCMD.
> 
> On Linux, could you enter a PID namespace instead and check that the
> next PID has the expected value?
> 
> Carlos, you added this testing.  Do you have an opinion here?

We should not regress testing WRDE_NOCMD, because doing so is what
lead to CVE-2014-7817 :-(

We should expend some effort here to provide robust testing for 
WRDE_NOCMD.

All 3 tests I added rely on registered_forks testing to verify
correct operation of WRDE_NOCMD.

Is there anything we can do about this Adhemerval?

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 18:32                   ` Florian Weimer
@ 2019-10-07 21:08                     ` Christian Brauner
  0 siblings, 0 replies; 59+ messages in thread
From: Christian Brauner @ 2019-10-07 21:08 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Adhemerval Zanella, libc-alpha

On Mon, Oct 07, 2019 at 08:32:52PM +0200, Florian Weimer wrote:
> * Christian Brauner:
> 
> >> >>>>> Maybe a CLONE_RESET_SIGNALS where the cloned process sets its signal
> >> >>>>> disposition to default SIG_IGN/SIG_DFL values may help us here.  However
> >> >>>>> afaik clone now is out of space on 'flags' for newer ones (it already
> >> >>>>> defines 24 flags plus it reserve 8 bits for signal to be sent at process
> >> >>>>> exit) and it would take time to use this feature on glibc.
> >> >>>>
> >> >>>> Christian Brauner has been working on fixing this.
> >> >>>
> >> >>> Which strategy he is proposing? Even with proper kernel support, it would
> >> >>> take time to enable glibc to use it.
> >> >>
> >> >> Lots of flag arguments, with the reset of the arguments located
> >> >> indirectly via a pointer argument.
> >> >>
> >> >> For a pure optimization, I think it's not too bad to require kernel
> >> >> backports of system calls.
> >
> > So I just accidently caught wind of this discussion. :)
> 
> Good. 8-)
> 
> > I'm open to extending clone3() to support something like the above.
> > My new clone3() version has been released with Linux 5.3. It takes a
> > struct clone_args. The structure is versioned by size and thus - in
> > theory - extensible indefinitely.
> 
> Christian, would you be able to implement the CLONE_RESET_SIGNALS flag
> for us?  It should reset any handler which is not SIG_IGN or SIG_DFL to
> SIG_DFL.  We'd also need a way to probe that the flag is supported, so
> that we can fall back to the current way of doing things otherwise.

Yeah, I can implement this. It shouldn't be too much work and if I can
Cc you and point out that you'd need/want this feature I can send a PR
for my thread updates for the 5.5 merge window.
Would that work for you?

Thanks!
Christian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-10-07 21:04     ` Carlos O'Donell
@ 2019-10-08  9:58       ` Florian Weimer
  0 siblings, 0 replies; 59+ messages in thread
From: Florian Weimer @ 2019-10-08  9:58 UTC (permalink / raw)
  To: Carlos O'Donell; +Cc: Adhemerval Zanella, libc-alpha

* Carlos O'Donell:

> On 10/7/19 3:33 PM, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
>>> index 10a0768a6b..ef780b0a65 100644
>>> --- a/posix/wordexp-test.c
>>> +++ b/posix/wordexp-test.c
>> 
>>> -/* For each fork increment the fork count.  */
>>> -static void
>>> -register_fork (void)
>>> -{
>>> -  registered_forks++;
>>> -}
>> 
>> It's a bit sad to see this testing go away.  It was originally added to
>> catch command execution with WRDE_NOCMD.
>> 
>> On Linux, could you enter a PID namespace instead and check that the
>> next PID has the expected value?
>> 
>> Carlos, you added this testing.  Do you have an opinion here?
>
> We should not regress testing WRDE_NOCMD, because doing so is what
> lead to CVE-2014-7817 :-(
>
> We should expend some effort here to provide robust testing for 
> WRDE_NOCMD.

I'm working on it.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-07 18:29         ` Florian Weimer
@ 2019-10-08 17:38           ` Adhemerval Zanella
  2019-10-08 18:52             ` Florian Weimer
  2020-11-02 19:51             ` Joseph Myers
  0 siblings, 2 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-08 17:38 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 07/10/2019 15:29, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> +#define copy_field(dst, src)			\
>> +  ({						\
>> +     typeof (src) _src;				\
>> +     memcpy (&_src, &(src), sizeof (src));	\
>> +     typeof (dst) _dst = _src;			\
>> +     memcpy (&(dst), &_dst, sizeof (dst));	\
>> +  })
>> +
>> +      copy_field (dp->d_ino, kdp->d_ino);
>> +      copy_field (dp->d_off, kdp->d_off);
>> +      copy_field (last_offset, kdp->d_off);
>> +      copy_field (dp->d_reclen, new_reclen);
>>        dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
> 
> I believe this still asserts the dynamic type of *dp, which is not what
> we want.  The truly portable way probably involves using offsetof and
> not -> dereferencing. 8-(
> 
> Considering that, I would probably drop copy_field, compile the file
> with -fno-strict-aliasing, and add a comment to the (now plain
> assignments) that this is okay due to -fno-strict-aliasing.
> 
> But this is really up to you, I do not want to discuss this patch to
> death.
> 

I chatted with Rich Felker yesterday and we couldn't be sure that
'&((T*)buf)->member' is indeed UB.  In any case I changed to a portable
way now and I think it should be ok to push.

--

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index a8c65cccbf..905239cad9 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -22,88 +22,113 @@
 #include <assert.h>
 #include <sys/param.h>
 #include <unistd.h>
-#include <scratch_buffer.h>
 #include <limits.h>
 
+#include <include/libc-pointer-arith.h>
+
 ssize_t
-__getdents64 (int fd, void *buf0, size_t nbytes)
+__getdents64 (int fd, void *buf, size_t nbytes)
 {
-  char *buf = buf0;
-
   /* The system call takes an unsigned int argument, and some length
      checks in the kernel use an int type.  */
   if (nbytes > INT_MAX)
     nbytes = INT_MAX;
 
 #ifdef __NR_getdents64
-  ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
-  if (ret != -1)
-    return ret;
+  static int getdents64_supported = true;
+  if (atomic_load_relaxed (&getdents64_supported))
+    {
+      ssize_t ret = INLINE_SYSCALL_CALL (getdents64, fd, buf, nbytes);
+      if (ret >= 0 || errno != ENOSYS)
+	return ret;
+
+      atomic_store_relaxed (&getdents64_supported, false);
+    }
 #endif
 
   /* Unfortunately getdents64 was only wire-up for MIPS n64 on Linux 3.10.
-     If syscall is not available it need to fallback to non-LFS one.  */
+     If the syscall is not available it need to fallback to the non-LFS one.
+     Also to avoid an unbounded allocation through VLA/alloca or malloc (which
+     would make the syscall non async-signal-safe) it uses a limited buffer.
+     This is sub-optimal for large NBYTES, however this is a fallback
+     mechanism to emulate a syscall that kernel should provide.   */
 
   struct kernel_dirent
-    {
-      unsigned long d_ino;
-      unsigned long d_off;
-      unsigned short int d_reclen;
-      char d_name[256];
-    };
+  {
+#if _MIPS_SIM == _ABI64
+    uint64_t d_ino;
+    uint64_t d_off;
+#else
+    uint32_t d_ino;
+    uint32_t d_off;
+#endif
+    unsigned short int d_reclen;
+    char d_name[1];
+  };
+
+  /* The largest possible practical length of the d_name member are 255
+     Unicode characters in UTF-8 encoding, so d_name is 766 bytes long, plus
+     18 (mips64) / 10 (mips64n32) bytes from header, for total of 784 (mips64)
+     / 776 (mips64n32) bytes total.  Ensure that the minimum size holds at
+     least one entry.  */
+  enum { KBUF_SIZE = 1024 };
+  char kbuf[KBUF_SIZE];
+  size_t kbuf_size = nbytes < KBUF_SIZE ? nbytes : KBUF_SIZE;
 
   const size_t size_diff = (offsetof (struct dirent64, d_name)
-			   - offsetof (struct kernel_dirent, d_name));
+                           - offsetof (struct kernel_dirent, d_name));
 
-  size_t red_nbytes = MIN (nbytes
-			   - ((nbytes / (offsetof (struct dirent64, d_name)
-					 + 14)) * size_diff),
-			   nbytes - size_diff);
+  struct dirent64 *dp = (struct dirent64 *) buf;
 
-  struct scratch_buffer tmpbuf;
-  scratch_buffer_init (&tmpbuf);
-  if (!scratch_buffer_set_array_size (&tmpbuf, red_nbytes, sizeof (uint8_t)))
-    INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOMEM);
+  size_t nb = 0;
+  off64_t last_offset = -1;
 
-  struct kernel_dirent *skdp, *kdp;
-  skdp = kdp = tmpbuf.data;
+  ssize_t r = INLINE_SYSCALL_CALL (getdents, fd, kbuf, kbuf_size);
+  if (r <= 0)
+    return r;
 
-  ssize_t retval = INLINE_SYSCALL_CALL (getdents, fd, kdp, red_nbytes);
-  if (retval == -1)
-    {
-      scratch_buffer_free (&tmpbuf);
-      return -1;
-    }
+  struct kernel_dirent *skdp, *kdp;
+  skdp = kdp = (struct kernel_dirent *) kbuf;
 
-  off64_t last_offset = -1;
-  struct dirent64 *dp = (struct dirent64 *) buf;
-  while ((char *) kdp < (char *) skdp + retval)
+  while ((char *) kdp < (char *) skdp + r)
     {
-      const size_t alignment = _Alignof (struct dirent64);
-      /* Since kdp->d_reclen is already aligned for the kernel structure
-	 this may compute a value that is bigger than necessary.  */
-      size_t new_reclen = ((kdp->d_reclen + size_diff + alignment - 1)
-			   & ~(alignment - 1));
-      if ((char *) dp + new_reclen > buf + nbytes)
-        {
-	  /* Our heuristic failed.  We read too many entries.  Reset
-	     the stream.  */
-	  assert (last_offset != -1);
-	  __lseek64 (fd, last_offset, SEEK_SET);
-
-	  if ((char *) dp == buf)
+      /* This is a conservative approximation, since some of size_diff might
+	 fit into the existing padding for alignment.  */
+      size_t new_reclen = ALIGN_UP (kdp->d_reclen + size_diff,
+				    _Alignof (struct dirent64));
+      if (nb + new_reclen > nbytes)
+	{
+	  /* Entry is too large for the fixed-size buffer.  */
+	  if (last_offset == -1)
 	    {
-	      scratch_buffer_free (&tmpbuf);
-	      return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
+	      __set_errno (EINVAL);
+	      return -1;
 	    }
 
-	  break;
+	  /* The new entry will overflow the input buffer, rewind to last
+	     obtained entry and return.  */
+	  __lseek64 (fd, last_offset, SEEK_SET);
+	  return (char *) dp - (char *) buf;
 	}
-
-      last_offset = kdp->d_off;
-      dp->d_ino = kdp->d_ino;
-      dp->d_off = kdp->d_off;
-      dp->d_reclen = new_reclen;
+      nb += new_reclen;
+
+#define DP_MEMBER(src, type, member)			     \
+    (__typeof__((type){0}.member) *)			     \
+      memcpy (&((__typeof__((type){0}.member)){0}),          \
+	      ((char *)(src) + offsetof (type, member)),     \
+	      sizeof ((type){0}.member))
+
+      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
+	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
+	      sizeof ((struct dirent64){0}.d_ino));
+      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
+	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
+	      sizeof ((struct dirent64){0}.d_ino));
+      memcpy (&last_offset,
+	      DP_MEMBER (kdp, struct kernel_dirent, d_off),
+	      sizeof (last_offset));
+      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
+	      sizeof ((struct dirent64){0}.d_reclen));
       dp->d_type = *((char *) kdp + kdp->d_reclen - 1);
       memcpy (dp->d_name, kdp->d_name,
 	      kdp->d_reclen - offsetof (struct kernel_dirent, d_name));
@@ -112,8 +137,7 @@ __getdents64 (int fd, void *buf0, size_t nbytes)
       kdp = (struct kernel_dirent *) (((char *) kdp) + kdp->d_reclen);
     }
 
-  scratch_buffer_free (&tmpbuf);
-  return (char *) dp - buf;
+  return (char *) dp - (char *) buf;
 }
 libc_hidden_def (__getdents64)
 weak_alias (__getdents64, getdents64) 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-10-07 19:33   ` Florian Weimer
  2019-10-07 21:04     ` Carlos O'Donell
@ 2019-10-08 17:41     ` Adhemerval Zanella
  2019-10-09  9:11       ` Florian Weimer
  1 sibling, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-08 17:41 UTC (permalink / raw)
  To: Florian Weimer, Carlos O'Donell; +Cc: libc-alpha



On 07/10/2019 16:33, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
>> index 10a0768a6b..ef780b0a65 100644
>> --- a/posix/wordexp-test.c
>> +++ b/posix/wordexp-test.c
> 
>> -/* For each fork increment the fork count.  */
>> -static void
>> -register_fork (void)
>> -{
>> -  registered_forks++;
>> -}
> 
> It's a bit sad to see this testing go away.  It was originally added to
> catch command execution with WRDE_NOCMD.
> 
> On Linux, could you enter a PID namespace instead and check that the
> next PID has the expected value?
> 
> Carlos, you added this testing.  Do you have an opinion here?
> 
>> diff --git a/posix/wordexp.c b/posix/wordexp.c
>> index 22c6d18a9c..e1aafcaceb 100644
>> --- a/posix/wordexp.c
>> +++ b/posix/wordexp.c
> 
>> +static char **
>> +create_environment (void)
>> +{
>> +  size_t s = 0;
>> +
>> +  /* Calculate total environment size, including 'IFS' if is present.  */
>> +  for (char **ep = __environ; *ep != NULL; ep++, s++);
> 
> I would put s++ into the body of the for loop, for clarity.  Or give ep
> a wider scope and use s = ep -- __environ.

Ack, I moved s++ to main loop.

> 
>> +  /* Include final NULL pointer.  */
>> +  char **newenviron = malloc (s * sizeof (char*));
>> +  if (newenviron == NULL)
>> +    return NULL;
> 
> char* should be char *.  I don't see how this includes the final NULL?
> 
> Should we do all this work only if IFS= is actually present?  That is,
> skip all this for getenv ("IFS) == NULL?

Ack.

> 
>> +  /* Copy current environment excluding 'IFS', to make sure the subshell
>> +     doesn't field-split on our behalf. */
> 
> That comment should apply to the entire function, I think.

Ack.

> 
>> @@ -884,13 +898,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
>>    size_t maxnewlines = 0;
>>    char buffer[bufsize];
>>    pid_t pid;
>> -  int noexec = 0;
>> +  bool noexec = false;
>>  
>>    /* Do nothing if command substitution should not succeed.  */
>>    if (flags & WRDE_NOCMD)
>>      return WRDE_CMDSUB;
>>  
>> -  /* Don't fork() unless necessary */
>> +  /* Don't posix_spawn() unless necessary */
> 
> GNU style doesn't use () after function names.

Ack.

I fixed your remarks, rebased against master to pick-up your changed to 
wordexp tests and used dynarray to construct the new environment (it 
simplifies a bit the creation on a new environment for the IFS case).

	* include/spawn.h (__posix_spawn_file_actions_addopen): New
	prototype.
	* posix/spawn_faction_addopen.c (posix_spawn_file_actions_addopen):
	Add internal alias.
	* posix/wordexp.c (create_environment, free_environment): New
	functions.
	(exec_comm_child, exec_comm): Use posix_spawn instead of fork+exec.
	* posix/wordexp-test.c: Use libsupport and remove atfork usage.
---
diff --git a/include/spawn.h b/include/spawn.h
index 7fdd965bd7..4a0b1849da 100644
--- a/include/spawn.h
+++ b/include/spawn.h
@@ -11,6 +11,9 @@ __typeof (posix_spawn_file_actions_addclose)
 __typeof (posix_spawn_file_actions_adddup2)
   __posix_spawn_file_actions_adddup2 attribute_hidden;
 
+__typeof (posix_spawn_file_actions_addopen)
+  __posix_spawn_file_actions_addopen attribute_hidden;
+
 __typeof (posix_spawn_file_actions_destroy)
   __posix_spawn_file_actions_destroy attribute_hidden;
 
diff --git a/posix/spawn_faction_addopen.c b/posix/spawn_faction_addopen.c
index d5694ee4d7..4fd64bb005 100644
--- a/posix/spawn_faction_addopen.c
+++ b/posix/spawn_faction_addopen.c
@@ -25,9 +25,9 @@
 /* Add an action to FILE-ACTIONS which tells the implementation to call
    `open' for the given file during the `spawn' call.  */
 int
-posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
-				  int fd, const char *path, int oflag,
-				  mode_t mode)
+__posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
+				    int fd, const char *path, int oflag,
+				    mode_t mode)
 {
   struct __spawn_action *rec;
 
@@ -60,3 +60,5 @@ posix_spawn_file_actions_addopen (posix_spawn_file_actions_t *file_actions,
 
   return 0;
 }
+weak_alias (__posix_spawn_file_actions_addopen,
+	    posix_spawn_file_actions_addopen)
diff --git a/posix/wordexp-test.c b/posix/wordexp-test.c
index a4d8bcf1da..34836f6240 100644
--- a/posix/wordexp-test.c
+++ b/posix/wordexp-test.c
@@ -15,19 +15,18 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sys/stat.h>
-#include <sys/types.h>
-#include <sys/mman.h>
+#include <wordexp.h>
+#include <stdio.h>
 #include <fcntl.h>
-#include <unistd.h>
 #include <pwd.h>
-#include <stdio.h>
-#include <stdint.h>
 #include <stdlib.h>
 #include <string.h>
-#include <wordexp.h>
+#include <sys/mman.h>
+
 #include <libc-pointer-arith.h>
-#include <dso_handle.h>
+#include <array_length.h>
+#include <support/xunistd.h>
+#include <support/check.h>
 
 #define IFS " \n\t"
 
@@ -40,7 +39,7 @@ struct test_case_struct
   size_t wordc;
   const char *wordv[10];
   const char *ifs;
-} test_case[] =
+} static test_case[] =
   {
     /* Simple word- and field-splitting */
     { 0, NULL, "one", 0, 1, { "one", }, IFS },
@@ -213,8 +212,6 @@ struct test_case_struct
     { WRDE_SYNTAX, NULL, "`\\", 0, 0, { NULL, }, IFS },     /* BZ 18042  */
     { WRDE_SYNTAX, NULL, "${", 0, 0, { NULL, }, IFS },      /* BZ 18043  */
     { WRDE_SYNTAX, NULL, "L${a:", 0, 0, { NULL, }, IFS },   /* BZ 18043#c4  */
-
-    { -1, NULL, NULL, 0, 0, { NULL, }, IFS },
   };
 
 static int testit (struct test_case_struct *tc);
@@ -226,21 +223,19 @@ command_line_test (const char *words)
   wordexp_t we;
   int i;
   int retval = wordexp (words, &we, 0);
-  printf ("wordexp returned %d\n", retval);
+  printf ("info: wordexp returned %d\n", retval);
   for (i = 0; i < we.we_wordc; i++)
-    printf ("we_wordv[%d] = \"%s\"\n", i, we.we_wordv[i]);
+    printf ("info: we_wordv[%d] = \"%s\"\n", i, we.we_wordv[i]);
 }
 
-int
-main (int argc, char *argv[])
+static int
+do_test (int argc, char *argv[])
 {
-  const char *globfile[] = { "one", "two", "three", NULL };
+  const char *globfile[] = { "one", "two", "three" };
   char tmpdir[32];
   struct passwd *pw;
   const char *cwd;
   int test;
-  int fail = 0;
-  int i;
   struct test_case_struct ts;
 
   if (argc > 1)
@@ -253,21 +248,18 @@ main (int argc, char *argv[])
 
   /* Set up arena for pathname expansion */
   tmpnam (tmpdir);
-  if (mkdir (tmpdir, S_IRWXU) || chdir (tmpdir))
-    return -1;
-  else
-    {
-      int fd;
+  xmkdir (tmpdir, S_IRWXU);
+  TEST_VERIFY_EXIT (chdir (tmpdir) == 0);
 
-      for (i = 0; globfile[i]; ++i)
-	if ((fd = creat (globfile[i], S_IRUSR | S_IWUSR)) == -1
-	    || close (fd))
-	  return -1;
+  for (int i = 0; i < array_length (globfile); ++i)
+    {
+      int fd = xopen (globfile[i], O_WRONLY|O_CREAT|O_TRUNC,
+		      S_IRUSR | S_IWUSR);
+      xclose (fd);
     }
 
-  for (test = 0; test_case[test].retval != -1; test++)
-    if (testit (&test_case[test]))
-      ++fail;
+  for (test = 0; test < array_length (test_case); test++)
+    TEST_COMPARE (testit (&test_case[test]), 0);
 
   /* Tilde-expansion tests. */
   pw = getpwnam ("root");
@@ -281,8 +273,7 @@ main (int argc, char *argv[])
       ts.wordv[0] = pw->pw_dir;
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
 
       ts.retval = 0;
       ts.env = pw->pw_dir;
@@ -292,8 +283,7 @@ main (int argc, char *argv[])
       ts.wordv[0] = "x";
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
     }
 
   /* "~" expands to value of $HOME when HOME is set */
@@ -308,8 +298,7 @@ main (int argc, char *argv[])
   ts.wordv[1] = "/dummy/home/foo";
   ts.ifs = IFS;
 
-  if (testit (&ts))
-    ++fail;
+  TEST_COMPARE (testit (&ts), 0);
 
   /* "~" expands to home dir from passwd file if HOME is not set */
 
@@ -325,14 +314,13 @@ main (int argc, char *argv[])
       ts.wordv[0] = pw->pw_dir;
       ts.ifs = IFS;
 
-      if (testit (&ts))
-	++fail;
+      TEST_COMPARE (testit (&ts), 0);
     }
 
   puts ("tests completed, now cleaning up");
 
   /* Clean up */
-  for (i = 0; globfile[i]; ++i)
+  for (int i = 0; i < array_length (globfile); ++i)
     remove (globfile[i]);
 
   if (cwd == NULL)
@@ -341,26 +329,17 @@ main (int argc, char *argv[])
   chdir (cwd);
   rmdir (tmpdir);
 
-  printf ("tests failed: %d\n", fail);
-
-  return fail != 0;
+  return 0;
 }
 
 static const char *
 at_page_end (const char *words)
 {
   const int pagesize = getpagesize ();
-  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
-		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
+		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
 
-  if (start == MAP_FAILED)
-    return start;
-
-  if (mprotect (start + pagesize, pagesize, PROT_NONE))
-    {
-      munmap (start, 2 * pagesize);
-      return MAP_FAILED;
-    }
+  xmprotect (start + pagesize, pagesize, PROT_NONE);
 
   /* Includes terminating NUL.  */
   const size_t words_size = strlen (words) + 1;
@@ -395,7 +374,7 @@ testit (struct test_case_struct *tc)
   sav_we.we_offs = 3;
   we = sav_we;
 
-  printf ("Test %d (%s): ", ++tests, tc->words);
+  printf ("info: test %d (%s): ", ++tests, tc->words);
   fflush (NULL);
   const char *words = at_page_end (tc->words);
 
@@ -404,7 +383,7 @@ testit (struct test_case_struct *tc)
       /* initial wordexp() call, to be appended to */
       if (wordexp ("pre1 pre2", &we, tc->flags & ~WRDE_APPEND) != 0)
         {
-	  printf ("FAILED setup\n");
+	  printf ("info: FAILED setup\n");
 	  return 1;
 	}
     }
@@ -436,7 +415,7 @@ testit (struct test_case_struct *tc)
   if (bzzzt)
     {
       printf ("FAILED\n");
-      printf ("Test words: <%s>, need retval %d, wordc %Zd\n",
+      printf ("info: Test words: <%s>, need retval %d, wordc %Zd\n",
 	      tc->words, tc->retval, tc->wordc);
       if (start_offs != 0)
 	printf ("(preceded by %d NULLs)\n", start_offs);
@@ -468,9 +447,11 @@ testit (struct test_case_struct *tc)
   const int page_size = getpagesize ();
   char *start = (char *) PTR_ALIGN_DOWN (words, page_size);
 
-  if (munmap (start, 2 * page_size) != 0)
-    return 1;
+  xmunmap (start, 2 * page_size);
 
   fflush (NULL);
   return bzzzt;
 }
+
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/posix/wordexp.c b/posix/wordexp.c
index 6a6e3a8e11..ea1ada1d2a 100644
--- a/posix/wordexp.c
+++ b/posix/wordexp.c
@@ -25,33 +25,18 @@
 #include <libintl.h>
 #include <paths.h>
 #include <pwd.h>
-#include <signal.h>
 #include <stdbool.h>
 #include <stdio.h>
-#include <stdlib.h>
 #include <string.h>
 #include <sys/param.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/types.h>
-#include <sys/types.h>
 #include <sys/wait.h>
 #include <unistd.h>
-#include <wchar.h>
 #include <wordexp.h>
-#include <kernel-features.h>
+#include <spawn.h>
 #include <scratch_buffer.h>
-
-#include <libc-lock.h>
 #include <_itoa.h>
-
-/* Undefine the following line for the production version.  */
-/* #define NDEBUG 1 */
 #include <assert.h>
 
-/* Get some device information.  */
-#include <device-nrs.h>
-
 /*
  * This is a recursive-descent-style word expansion routine.
  */
@@ -812,61 +797,76 @@ parse_arith (char **word, size_t *word_length, size_t *max_length,
   return WRDE_SYNTAX;
 }
 
+#define DYNARRAY_STRUCT        strlist
+#define DYNARRAY_ELEMENT       char *
+#define DYNARRAY_PREFIX        strlist_
+/* Allocates about 512/1024 (32/64 bit) on stack.  */
+#define DYNARRAY_INITIAL_SIZE  128
+#include <malloc/dynarray-skeleton.c>
+
 /* Function called by child process in exec_comm() */
-static inline void
-__attribute__ ((always_inline))
-exec_comm_child (char *comm, int *fildes, int showerr, int noexec)
+static pid_t
+exec_comm_child (char *comm, int *fildes, bool showerr, bool noexec)
 {
-  const char *args[4] = { _PATH_BSHELL, "-c", comm, NULL };
+  pid_t pid = -1;
 
-  /* Execute the command, or just check syntax? */
-  if (noexec)
-    args[1] = "-nc";
+  /* Execute the command, or just check syntax?  */
+  const char *args[] = { _PATH_BSHELL, noexec ? "-nc" : "-c", comm, NULL };
+
+  posix_spawn_file_actions_t fa;
+  /* posix_spawn_file_actions_init does not fail.  */
+  __posix_spawn_file_actions_init (&fa);
 
-  /* Redirect output.  */
-  if (__glibc_likely (fildes[1] != STDOUT_FILENO))
+  /* Redirect output.  For check syntax only (noexec being true), exec_comm
+     explicits sets fildes[1] to -1, so check its value to avoid a failure in
+     __posix_spawn_file_actions_adddup2.  */
+  if (fildes[1] != -1)
     {
-      __dup2 (fildes[1], STDOUT_FILENO);
-      __close (fildes[1]);
+      if (__glibc_likely (fildes[1] != STDOUT_FILENO))
+	{
+	  if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1],
+						  STDOUT_FILENO) != 0
+	      || __posix_spawn_file_actions_addclose (&fa, fildes[1]) != 0)
+	    goto out;
+	}
+      else
+	/* Reset the close-on-exec flag (if necessary).  */
+	if (__posix_spawn_file_actions_adddup2 (&fa, fildes[1], fildes[1])
+	    != 0)
+	  goto out;
     }
-  else
-    /* Reset the close-on-exec flag (if necessary).  */
-    __fcntl (fildes[1], F_SETFD, 0);
 
   /* Redirect stderr to /dev/null if we have to.  */
-  if (showerr == 0)
+  if (!showerr)
+    if (__posix_spawn_file_actions_addopen (&fa, STDERR_FILENO, _PATH_DEVNULL,
+					    O_WRONLY, 0) != 0)
+      goto out;
+
+  struct strlist newenv;
+  strlist_init (&newenv);
+
+  bool recreate_env = getenv ("IFS") != NULL;
+  if (recreate_env)
     {
-      struct stat64 st;
-      int fd;
-      __close (STDERR_FILENO);
-      fd = __open (_PATH_DEVNULL, O_WRONLY);
-      if (fd >= 0 && fd != STDERR_FILENO)
-	{
-	  __dup2 (fd, STDERR_FILENO);
-	  __close (fd);
-	}
-      /* Be paranoid.  Check that we actually opened the /dev/null
-	 device.  */
-      if (__builtin_expect (__fxstat64 (_STAT_VER, STDERR_FILENO, &st), 0) != 0
-	  || __builtin_expect (S_ISCHR (st.st_mode), 1) == 0
-#if defined DEV_NULL_MAJOR && defined DEV_NULL_MINOR
-	  || st.st_rdev != __gnu_dev_makedev (DEV_NULL_MAJOR, DEV_NULL_MINOR)
-#endif
-	  )
-	/* It's not the /dev/null device.  Stop right here.  The
-	   problem is: how do we stop?  We use _exit() with an
-	   hopefully unusual exit code.  */
-	_exit (90);
+      for (char **ep = __environ; *ep != NULL; ep++)
+	if (strncmp (*ep, "IFS=", sizeof ("IFS=")-1) != 0)
+	  strlist_add (&newenv, *ep);
+      strlist_add (&newenv, NULL);
+      if (strlist_has_failed (&newenv))
+	goto out;
     }
 
-  /* Make sure the subshell doesn't field-split on our behalf. */
-  __unsetenv ("IFS");
+  /* pid is unset if posix_spawn fails, so it keep the original value
+     of -1.  */
+  __posix_spawn (&pid, _PATH_BSHELL, &fa, NULL, (char *const *) args,
+		 recreate_env ? strlist_begin (&newenv) : __environ);
 
-  __close (fildes[0]);
-  __execve (_PATH_BSHELL, (char *const *) args, __environ);
+  strlist_free (&newenv);
+
+out:
+  __posix_spawn_file_actions_destroy (&fa);
 
-  /* Bad.  What now?  */
-  abort ();
+  return pid;
 }
 
 /* Function to execute a command and retrieve the results */
@@ -884,13 +884,13 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
   size_t maxnewlines = 0;
   char buffer[bufsize];
   pid_t pid;
-  int noexec = 0;
+  bool noexec = false;
 
   /* Do nothing if command substitution should not succeed.  */
   if (flags & WRDE_NOCMD)
     return WRDE_CMDSUB;
 
-  /* Don't fork() unless necessary */
+  /* Don't posix_spawn unless necessary */
   if (!comm || !*comm)
     return 0;
 
@@ -898,19 +898,15 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
     return WRDE_NOSPACE;
 
  again:
-  if ((pid = __fork ()) < 0)
+  pid = exec_comm_child (comm, fildes, noexec ? false : flags & WRDE_SHOWERR,
+			 noexec);
+  if (pid < 0)
     {
-      /* Bad */
       __close (fildes[0]);
       __close (fildes[1]);
       return WRDE_NOSPACE;
     }
 
-  if (pid == 0)
-    exec_comm_child (comm, fildes, noexec ? 0 : flags & WRDE_SHOWERR, noexec);
-
-  /* Parent */
-
   /* If we are just testing the syntax, only wait.  */
   if (noexec)
     return (TEMP_FAILURE_RETRY (__waitpid (pid, &status, 0)) == pid
@@ -1091,7 +1087,7 @@ exec_comm (char *comm, char **word, size_t *word_length, size_t *max_length,
   /* Check for syntax error (re-execute but with "-n" flag) */
   if (buflen < 1 && status != 0)
     {
-      noexec = 1;
+      noexec = true;
       goto again;
     }
 
@@ -1143,26 +1139,9 @@ parse_comm (char **word, size_t *word_length, size_t *max_length,
 	      /* Go -- give script to the shell */
 	      if (comm)
 		{
-#ifdef __libc_ptf_call
-		  /* We do not want the exec_comm call to be cut short
-		     by a thread cancellation since cleanup is very
-		     ugly.  Therefore disable cancellation for
-		     now.  */
-		  // XXX Ideally we do want the thread being cancelable.
-		  // XXX If demand is there we'll change it.
-		  int state = PTHREAD_CANCEL_ENABLE;
-		  __libc_ptf_call (__pthread_setcancelstate,
-				   (PTHREAD_CANCEL_DISABLE, &state), 0);
-#endif
-
+		  /* posix_spawn already handles thread cancellation.  */
 		  error = exec_comm (comm, word, word_length, max_length,
 				     flags, pwordexp, ifs, ifs_white);
-
-#ifdef __libc_ptf_call
-		  __libc_ptf_call (__pthread_setcancelstate,
-				   (state, NULL), 0);
-#endif
-
 		  free (comm);
 		}
 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-08 17:38           ` Adhemerval Zanella
@ 2019-10-08 18:52             ` Florian Weimer
  2019-10-08 19:52               ` Adhemerval Zanella
  2020-11-02 19:51             ` Joseph Myers
  1 sibling, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-08 18:52 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: libc-alpha

* Adhemerval Zanella:

> +#define DP_MEMBER(src, type, member)			     \
> +    (__typeof__((type){0}.member) *)			     \
> +      memcpy (&((__typeof__((type){0}.member)){0}),          \
> +	      ((char *)(src) + offsetof (type, member)),     \
> +	      sizeof ((type){0}.member))

Please add a comment that this is used to avoid an aliasing violation.

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
> +	      sizeof ((struct dirent64){0}.d_ino));
> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
> +	      sizeof ((struct dirent64){0}.d_ino));
> +      memcpy (&last_offset,
> +	      DP_MEMBER (kdp, struct kernel_dirent, d_off),
> +	      sizeof (last_offset));

I think you should be able to use:

   last_offset = *DP_MEMBER (kdp, struct kernel_dirent, d_off);

last_offset has the correct type.

> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
> +	      sizeof ((struct dirent64){0}.d_reclen));

That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a
temporary object, so the outer memcpy is dead.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-08 18:52             ` Florian Weimer
@ 2019-10-08 19:52               ` Adhemerval Zanella
  2019-10-08 19:59                 ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-08 19:52 UTC (permalink / raw)
  To: Florian Weimer; +Cc: libc-alpha



On 08/10/2019 15:52, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> +#define DP_MEMBER(src, type, member)			     \
>> +    (__typeof__((type){0}.member) *)			     \
>> +      memcpy (&((__typeof__((type){0}.member)){0}),          \
>> +	      ((char *)(src) + offsetof (type, member)),     \
>> +	      sizeof ((type){0}.member))
> 
> Please add a comment that this is used to avoid an aliasing violation.

Ack.

> 
>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
>> +	      sizeof ((struct dirent64){0}.d_ino));
>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
>> +	      sizeof ((struct dirent64){0}.d_ino));
>> +      memcpy (&last_offset,
>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_off),
>> +	      sizeof (last_offset));
> 
> I think you should be able to use:
> 
>    last_offset = *DP_MEMBER (kdp, struct kernel_dirent, d_off);
> 
> last_offset has the correct type.

Ack.

> 
>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
>> +	      sizeof ((struct dirent64){0}.d_reclen));
> 
> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a
> temporary object, so the outer memcpy is dead.

Sigh, indeed. I changed to:

   memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),
           &new_reclen, sizeof ((struct dirent64){0}.d_reclen));

> 
> Thanks,
> Florian
> 

I hope I got this right this time...

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-08 19:52               ` Adhemerval Zanella
@ 2019-10-08 19:59                 ` Florian Weimer
  2019-10-09 13:02                   ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-08 19:59 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Florian Weimer, libc-alpha

* Adhemerval Zanella:

>>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
>>> +	      sizeof ((struct dirent64){0}.d_reclen));
>> 
>> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a
>> temporary object, so the outer memcpy is dead.
>
> Sigh, indeed. I changed to:
>
>    memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),
>            &new_reclen, sizeof ((struct dirent64){0}.d_reclen));

sizeof ((struct dirent64){0}.d_reclen) could just be
sizeof (new_reclen).  After all, this only works if they are the same.

I guess -fno-strict-aliasing looks more attractive now. 8-/

You probably should write ((char *) dp) instead of (char *)(dp) if you
want to make the operator precedence explicit, or at least drop the
parentheses around dp.  (I think the cast binds tighter than the +,
but I can't really remember.  I tend to write the paranetheses.)

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-10-08 17:41     ` Adhemerval Zanella
@ 2019-10-09  9:11       ` Florian Weimer
  2019-10-09 12:18         ` Adhemerval Zanella
  0 siblings, 1 reply; 59+ messages in thread
From: Florian Weimer @ 2019-10-09  9:11 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Carlos O'Donell, libc-alpha

Thanks for the updated patch.

* Adhemerval Zanella:

>  static const char *
>  at_page_end (const char *words)
>  {
>    const int pagesize = getpagesize ();
> -  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
> -		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
> +  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
> +		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
>  
> -  if (start == MAP_FAILED)
> -    return start;
> -
> -  if (mprotect (start + pagesize, pagesize, PROT_NONE))
> -    {
> -      munmap (start, 2 * pagesize);
> -      return MAP_FAILED;
> -    }
> +  xmprotect (start + pagesize, pagesize, PROT_NONE);

I believe you can use <support/next_to_fault.h> for that.

> +	if (strncmp (*ep, "IFS=", sizeof ("IFS=")-1) != 0)

Missing spaces around -.  In my opinion, you should just call strlen.
GCC will fold it to a constant.

>   /* pid is unset if posix_spawn fails, so it keep the original value

“pid is not set” or “pid is not updated”, I think.

Rest looks okay to me.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-07 19:20                       ` Adhemerval Zanella
@ 2019-10-09  9:37                         ` Florian Weimer
  2019-10-09 10:25                           ` Christian Brauner
  2019-10-09 12:17                           ` Adhemerval Zanella
  0 siblings, 2 replies; 59+ messages in thread
From: Florian Weimer @ 2019-10-09  9:37 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Christian Brauner, libc-alpha

* Adhemerval Zanella:

> On 07/10/2019 15:40, Florian Weimer wrote:
>> * Adhemerval Zanella:
>> 
>>> However, glibc supports older kernels as old as v3.2 and it will take
>>> some years and releases to make v5.3 or new the minimum support kernel.
>>> And I think it would be nice to have this optimization even for older
>>> kernels.
>> 
>> But wouldn't it make sense to backport clone3 to these older kernels, so
>> that further enhancements are possible, in cooperation with the kernel.
>
> For a kernel standpoint sure, for libc one it only make sense if it becomes
> de-facto kernel ABI. It can be quite feasible from a distribution standpoint,
> where it controls both kernel and userland deployment. But it is not the only
> scenario glibc aims to work neither we should prioritize it.

Sure.  But I think we should keep in mind here that this is not a
localized optimization.  It optimizes posix_spawn with something that
extends into something that is (at least superficially) completely
unrelated.

If we can get kernel assistance for the optimization (and it looks like
we'll receive it), we can avoid that complexity.  The patch Christian
posted is very small.  It's on top of clone3, sure, but I expect that
people will want the system call anyway for some container support case
soon enough, so long-term maintained kernels will get it essentially for
free.

Using the new clone3 flag looks inherently backportable to me on the
glibc side.  Compared to that, the sigaction changes look a bit risky to
me.

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-09  9:37                         ` Florian Weimer
@ 2019-10-09 10:25                           ` Christian Brauner
  2019-10-09 12:17                           ` Adhemerval Zanella
  1 sibling, 0 replies; 59+ messages in thread
From: Christian Brauner @ 2019-10-09 10:25 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Adhemerval Zanella, libc-alpha

On Wed, Oct 09, 2019 at 11:37:30AM +0200, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
> > On 07/10/2019 15:40, Florian Weimer wrote:
> >> * Adhemerval Zanella:
> >> 
> >>> However, glibc supports older kernels as old as v3.2 and it will take
> >>> some years and releases to make v5.3 or new the minimum support kernel.
> >>> And I think it would be nice to have this optimization even for older
> >>> kernels.
> >> 
> >> But wouldn't it make sense to backport clone3 to these older kernels, so
> >> that further enhancements are possible, in cooperation with the kernel.
> >
> > For a kernel standpoint sure, for libc one it only make sense if it becomes
> > de-facto kernel ABI. It can be quite feasible from a distribution standpoint,
> > where it controls both kernel and userland deployment. But it is not the only
> > scenario glibc aims to work neither we should prioritize it.
> 
> Sure.  But I think we should keep in mind here that this is not a
> localized optimization.  It optimizes posix_spawn with something that
> extends into something that is (at least superficially) completely
> unrelated.
> 
> If we can get kernel assistance for the optimization (and it looks like
> we'll receive it), we can avoid that complexity.  The patch Christian
> posted is very small.  It's on top of clone3, sure, but I expect that
> people will want the system call anyway for some container support case
> soon enough, so long-term maintained kernels will get it essentially for
> free.

Yeah, the time namespace patchset will introduce CLONE_NEWTIME and that
flag will only be possible with clone3() for obvious reasons. And I'm
pretty sure that a lot of database workloads will want that...

> 
> Using the new clone3 flag looks inherently backportable to me on the
> glibc side.  Compared to that, the sigaction changes look a bit risky to
> me.

In the future I would like certain sets of changes that we currently do
in a racy way in the child to be made right at process creation time.
Another flag that I has been in my mind for a long time is e.g.
pdeath_signal. There should probably be an extension to struct
clone_args that introduces a new member pdeath_signal which won't be
reset and will be delivered to the child once the parent dies.

Christian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-09  9:37                         ` Florian Weimer
  2019-10-09 10:25                           ` Christian Brauner
@ 2019-10-09 12:17                           ` Adhemerval Zanella
  2019-10-09 19:16                             ` Florian Weimer
  1 sibling, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-09 12:17 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Christian Brauner, libc-alpha



On 09/10/2019 06:37, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>> On 07/10/2019 15:40, Florian Weimer wrote:
>>> * Adhemerval Zanella:
>>>
>>>> However, glibc supports older kernels as old as v3.2 and it will take
>>>> some years and releases to make v5.3 or new the minimum support kernel.
>>>> And I think it would be nice to have this optimization even for older
>>>> kernels.
>>>
>>> But wouldn't it make sense to backport clone3 to these older kernels, so
>>> that further enhancements are possible, in cooperation with the kernel.
>>
>> For a kernel standpoint sure, for libc one it only make sense if it becomes
>> de-facto kernel ABI. It can be quite feasible from a distribution standpoint,
>> where it controls both kernel and userland deployment. But it is not the only
>> scenario glibc aims to work neither we should prioritize it.
> 
> Sure.  But I think we should keep in mind here that this is not a
> localized optimization.  It optimizes posix_spawn with something that
> extends into something that is (at least superficially) completely
> unrelated.
> 
> If we can get kernel assistance for the optimization (and it looks like
> we'll receive it), we can avoid that complexity.  The patch Christian
> posted is very small.  It's on top of clone3, sure, but I expect that
> people will want the system call anyway for some container support case
> soon enough, so long-term maintained kernels will get it essentially for
> free.
> 
> Using the new clone3 flag looks inherently backportable to me on the
> glibc side.  Compared to that, the sigaction changes look a bit risky to
> me.

But still I don't see that this being more complex to backport as being
a impeding reason to push it upstream. Even we push the minimum kernel
version higher and remove the inherent minimum kernel version check, it
would be case where the posix_spawn signals reset will still trigger in
older kernels.

I give you that eventually we might remove this optimization once we
assume a minimum kernel version, but as I said this might take some
time and I think optimizing posix_spawn (along with adding the required
extensions developers see as useful) it a way to promote it over the
fork plus execve and its deficiencies. 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 5/5] posix: Use posix_spawn for wordexp
  2019-10-09  9:11       ` Florian Weimer
@ 2019-10-09 12:18         ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-09 12:18 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Carlos O'Donell, libc-alpha



On 09/10/2019 06:11, Florian Weimer wrote:
> Thanks for the updated patch.
> 
> * Adhemerval Zanella:
> 
>>  static const char *
>>  at_page_end (const char *words)
>>  {
>>    const int pagesize = getpagesize ();
>> -  char *start = mmap (0, 2 * pagesize, PROT_READ|PROT_WRITE,
>> -		      MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
>> +  char *start = xmmap (0, 2 * pagesize, PROT_READ | PROT_WRITE,
>> +		       MAP_PRIVATE | MAP_ANONYMOUS, -1);
>>  
>> -  if (start == MAP_FAILED)
>> -    return start;
>> -
>> -  if (mprotect (start + pagesize, pagesize, PROT_NONE))
>> -    {
>> -      munmap (start, 2 * pagesize);
>> -      return MAP_FAILED;
>> -    }
>> +  xmprotect (start + pagesize, pagesize, PROT_NONE);
> 
> I believe you can use <support/next_to_fault.h> for that.

Ack, I will change to use it.

> 
>> +	if (strncmp (*ep, "IFS=", sizeof ("IFS=")-1) != 0)
> 
> Missing spaces around -.  In my opinion, you should just call strlen.
> GCC will fold it to a constant.

Ack.

> 
>>   /* pid is unset if posix_spawn fails, so it keep the original value
> 
> “pid is not set” or “pid is not updated”, I think.

Ack.

> 
> Rest looks okay to me.
> 
> Thanks,
> Florian
> 

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-08 19:59                 ` Florian Weimer
@ 2019-10-09 13:02                   ` Adhemerval Zanella
  0 siblings, 0 replies; 59+ messages in thread
From: Adhemerval Zanella @ 2019-10-09 13:02 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Florian Weimer, libc-alpha



On 08/10/2019 16:59, Florian Weimer wrote:
> * Adhemerval Zanella:
> 
>>>> +      memcpy (DP_MEMBER (dp, struct dirent64, d_reclen), &new_reclen,
>>>> +	      sizeof ((struct dirent64){0}.d_reclen));
>>>
>>> That looks wrong.  DP_MEMBER (dp, struct dirent64, d_reclen) is a
>>> temporary object, so the outer memcpy is dead.
>>
>> Sigh, indeed. I changed to:
>>
>>    memcpy (((char *)(dp) + offsetof (struct dirent64, d_reclen)),
>>            &new_reclen, sizeof ((struct dirent64){0}.d_reclen));
> 
> sizeof ((struct dirent64){0}.d_reclen) could just be
> sizeof (new_reclen).  After all, this only works if they are the same.

Ack.

> 
> I guess -fno-strict-aliasing looks more attractive now. 8-/
> 
> You probably should write ((char *) dp) instead of (char *)(dp) if you
> want to make the operator precedence explicit, or at least drop the
> parentheses around dp.  (I think the cast binds tighter than the +,
> but I can't really remember.  I tend to write the paranetheses.)
> 

Ack.

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls
  2019-10-09 12:17                           ` Adhemerval Zanella
@ 2019-10-09 19:16                             ` Florian Weimer
  0 siblings, 0 replies; 59+ messages in thread
From: Florian Weimer @ 2019-10-09 19:16 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Christian Brauner, libc-alpha

* Adhemerval Zanella:

> But still I don't see that this being more complex to backport as being
> a impeding reason to push it upstream.

Your argument was that the glibc-only optimization would be available to
users more quickly.  That's why I brought backporting up.

I still think the direction this optimization is taking quite wrong.  We
shouldn't add code to system call wrappers to collect secondary
information if we can help it.  openat had something like this, and it
went wrong with O_TMPFILE.  sigaction is not really simple, either.  It
has its own flags, and the kernel might enhance the system call in
unexpected ways, too.

Just to be clear, I think it's worthwhile to optimize this for the
reasons you indicated.  We merely disagree about the means. 8-)

Thanks,
Florian

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2019-10-08 17:38           ` Adhemerval Zanella
  2019-10-08 18:52             ` Florian Weimer
@ 2020-11-02 19:51             ` Joseph Myers
  2020-11-02 22:10               ` Adhemerval Zanella
  1 sibling, 1 reply; 59+ messages in thread
From: Joseph Myers @ 2020-11-02 19:51 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Florian Weimer, libc-alpha

On Tue, 8 Oct 2019, Adhemerval Zanella wrote:

> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
> +	      sizeof ((struct dirent64){0}.d_ino));
> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
> +	      sizeof ((struct dirent64){0}.d_ino));

(This is slightly different from the version of the code that ended up 
getting committed.)

GCC mainline now gives a rather cryptic error about this code:

../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c: In function '__getdents64':
../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:121:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
  121 |       memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  122 |               KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));
      |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:123:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
  123 |       memcpy (((char *) dp + offsetof (struct dirent64, d_off)),
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  124 |               KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));
      |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

I think what this error is pointing out is that the field in 
kernel_dirent, for non-n64, is 32-bit, while this is using memcpy to copy 
64 bits from it into the glibc dirent64, which obviously doesn't work.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2020-11-02 19:51             ` Joseph Myers
@ 2020-11-02 22:10               ` Adhemerval Zanella
  2020-11-03 10:27                 ` Florian Weimer
  0 siblings, 1 reply; 59+ messages in thread
From: Adhemerval Zanella @ 2020-11-02 22:10 UTC (permalink / raw)
  To: Joseph Myers; +Cc: Florian Weimer, libc-alpha



On 02/11/2020 16:51, Joseph Myers wrote:
> On Tue, 8 Oct 2019, Adhemerval Zanella wrote:
> 
>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_ino)),
>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
>> +	      sizeof ((struct dirent64){0}.d_ino));
>> +      memcpy (((char *)(dp) + offsetof (struct dirent64, d_off)),
>> +	      DP_MEMBER (kdp, struct kernel_dirent, d_ino),
>> +	      sizeof ((struct dirent64){0}.d_ino));
> 
> (This is slightly different from the version of the code that ended up 
> getting committed.)
> 
> GCC mainline now gives a rather cryptic error about this code:
> 
> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c: In function '__getdents64':
> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:121:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
>   121 |       memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),
>       |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>   122 |               KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));
>       |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> ../sysdeps/unix/sysv/linux/mips/mips64/getdents64.c:123:7: error: 'memcpy' forming offset [4, 7] is out of the bounds [0, 4] [-Werror=array-bounds]
>   123 |       memcpy (((char *) dp + offsetof (struct dirent64, d_off)),
>       |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
>   124 |               KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));
>       |               ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> 
> I think what this error is pointing out is that the field in 
> kernel_dirent, for non-n64, is 32-bit, while this is using memcpy to copy 
> 64 bits from it into the glibc dirent64, which obviously doesn't work.
> 

I was trying to be too clever to avoid a temporary variable to handle 
mips64n32.  I think the below should handle the issue raised by GCC11,
I will just check some on mips64 machine from gcc farm before send the
fix.

---

diff --git a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
index d18a5297dc..2ea1369ef4 100644
--- a/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
+++ b/sysdeps/unix/sysv/linux/mips/mips64/getdents64.c
@@ -90,16 +90,29 @@ __getdents64 (int fd, void *buf, size_t nbytes)
 
   while ((char *) kdp < (char *) skdp + r)
     {
-      /* This macro is used to avoid aliasing violation.  */
-#define KDP_MEMBER(src, member)			     			\
-    (__typeof__((struct kernel_dirent){0}.member) *)			\
-      memcpy (&((__typeof__((struct kernel_dirent){0}.member)){0}),	\
-	      ((char *)(src) + offsetof (struct kernel_dirent, member)),\
-	      sizeof ((struct kernel_dirent){0}.member))
+#define KDP_MEMBER(src, member)						     \
+      ({								     \
+	__typeof ((struct kernel_dirent){0}.member) kdp_tmp;		     \
+	memcpy (&kdp_tmp,						     \
+		((char *)(src) + offsetof (struct kernel_dirent, member)),   \
+		sizeof (kdp_tmp));					     \
+	kdp_tmp;							     \
+      })
+
+      /* Copy the MEMBER from SRC kernel_dirent to DST dirent64.  It handles
+	 the different size of d_off/d_ino for mips64-n32 by using temporary
+	 variables.  */
+#define COPY_MEMBER(src, dst, member)					     \
+      ({								     \
+	__typeof ((struct dirent64){0}.member) dp_tmp			     \
+	  = KDP_MEMBER (src, member);					     \
+	memcpy ((char *) dp + offsetof (struct dirent64, d_off),	     \
+		&dp_tmp, sizeof (dp_tmp));				     \
+      })
 
       /* This is a conservative approximation, since some of size_diff might
 	 fit into the existing padding for alignment.  */
-      unsigned short int k_reclen = *KDP_MEMBER (kdp, d_reclen);
+      unsigned short int k_reclen = KDP_MEMBER (kdp, d_reclen);
       unsigned short int new_reclen = ALIGN_UP (k_reclen + size_diff,
 						_Alignof (struct dirent64));
       if (nb + new_reclen > nbytes)
@@ -118,11 +131,10 @@ __getdents64 (int fd, void *buf, size_t nbytes)
 	}
       nb += new_reclen;
 
-      memcpy (((char *) dp + offsetof (struct dirent64, d_ino)),
-	      KDP_MEMBER (kdp, d_ino), sizeof ((struct dirent64){0}.d_ino));
-      memcpy (((char *) dp + offsetof (struct dirent64, d_off)),
-	      KDP_MEMBER (kdp, d_off), sizeof ((struct dirent64){0}.d_off));
-      last_offset = *KDP_MEMBER (kdp, d_off);
+      COPY_MEMBER (dp, kdp, d_off);
+      COPY_MEMBER (dp, kdp, d_ino);
+
+      last_offset = KDP_MEMBER (kdp, d_off);
       memcpy (((char *) dp + offsetof (struct dirent64, d_reclen)),
 	      &new_reclen, sizeof (new_reclen));
       dp->d_type = *((char *) kdp + k_reclen - 1);

^ permalink raw reply	[flat|nested] 59+ messages in thread

* Re: [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback
  2020-11-02 22:10               ` Adhemerval Zanella
@ 2020-11-03 10:27                 ` Florian Weimer
  0 siblings, 0 replies; 59+ messages in thread
From: Florian Weimer @ 2020-11-03 10:27 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Joseph Myers, libc-alpha

* Adhemerval Zanella:

> I was trying to be too clever to avoid a temporary variable to handle 
> mips64n32.  I think the below should handle the issue raised by GCC11,
> I will just check some on mips64 machine from gcc farm before send the
> fix.

I think at this point it might be clearer two have two new structs with
the first few fields of the actual structs, use memcpy on the whole
structs, and a field-by-field copy between the structs to perform the
type adjustment.

It's not that we expect the layout of the structs involved to change.

thanks,
Florian
-- 
Red Hat GmbH, https://de.redhat.com/ , Registered seat: Grasbrunn,
Commercial register: Amtsgericht Muenchen, HRB 153243,
Managing Directors: Charles Cachera, Brian Klemm, Laurie Krebs, Michael O'Neill


^ permalink raw reply	[flat|nested] 59+ messages in thread

end of thread, other threads:[~2020-11-03 10:28 UTC | newest]

Thread overview: 59+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-07-31 18:31 [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
2019-07-31 18:31 ` [PATCH 4/5] linux: Optimize posix_spawn spurious sigaction calls Adhemerval Zanella
2019-08-28 14:09   ` Adhemerval Zanella
2019-08-29  8:38   ` Florian Weimer
2019-08-29 11:26     ` Adhemerval Zanella
2019-08-30 10:07       ` Florian Weimer
2019-08-30 13:05         ` Adhemerval Zanella
2019-09-02 13:14           ` Florian Weimer
2019-09-02 19:47             ` Adhemerval Zanella
2019-10-07 17:51               ` Adhemerval Zanella
2019-10-07 18:25                 ` Christian Brauner
2019-10-07 18:32                   ` Florian Weimer
2019-10-07 21:08                     ` Christian Brauner
2019-10-07 18:35                   ` Adhemerval Zanella
2019-10-07 18:40                     ` Florian Weimer
2019-10-07 19:20                       ` Adhemerval Zanella
2019-10-09  9:37                         ` Florian Weimer
2019-10-09 10:25                           ` Christian Brauner
2019-10-09 12:17                           ` Adhemerval Zanella
2019-10-09 19:16                             ` Florian Weimer
2019-10-07 21:00                     ` Joseph Myers
2019-10-07 18:41                 ` Florian Weimer
2019-07-31 18:31 ` [PATCH v2 3/5] posix: Optimize stack Linux posix_spawn Adhemerval Zanella
2019-08-28 14:09   ` Adhemerval Zanella
2019-10-07 17:50     ` Adhemerval Zanella
2019-07-31 18:31 ` [PATCH v2 2/5] posix: Add posix_spawn_file_actions_closefrom Adhemerval Zanella
2019-08-28 14:09   ` Adhemerval Zanella
2019-08-28 17:22     ` Joseph Myers
2019-08-28 21:03       ` Adhemerval Zanella
2019-07-31 18:31 ` [PATCH v2 5/5] posix: Use posix_spawn for wordexp Adhemerval Zanella
2019-08-28 14:10   ` Adhemerval Zanella
2019-10-07 17:51     ` Adhemerval Zanella
2019-10-07 19:33   ` Florian Weimer
2019-10-07 21:04     ` Carlos O'Donell
2019-10-08  9:58       ` Florian Weimer
2019-10-08 17:41     ` Adhemerval Zanella
2019-10-09  9:11       ` Florian Weimer
2019-10-09 12:18         ` Adhemerval Zanella
2019-08-28 14:09 ` [PATCH v2 1/5] mips: Do not malloc on getdents64 fallback Adhemerval Zanella
2019-08-28 14:35 ` Andreas Schwab
2019-08-28 17:01   ` Adhemerval Zanella
2019-08-28 14:42 ` Florian Weimer
2019-08-28 21:02   ` Adhemerval Zanella
2019-08-28 21:23     ` Florian Weimer
2019-08-29 11:04       ` Adhemerval Zanella
2019-08-30  9:53 ` Florian Weimer
2019-08-30 12:53   ` Adhemerval Zanella
2019-09-02 12:59     ` Florian Weimer
2019-09-02 17:38       ` Adhemerval Zanella
2019-10-07 17:49         ` Adhemerval Zanella
2019-10-07 18:29         ` Florian Weimer
2019-10-08 17:38           ` Adhemerval Zanella
2019-10-08 18:52             ` Florian Weimer
2019-10-08 19:52               ` Adhemerval Zanella
2019-10-08 19:59                 ` Florian Weimer
2019-10-09 13:02                   ` Adhemerval Zanella
2020-11-02 19:51             ` Joseph Myers
2020-11-02 22:10               ` Adhemerval Zanella
2020-11-03 10:27                 ` Florian Weimer

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).