public inbox for cygwin-patches@cygwin.com
 help / color / mirror / Atom feed
* [PATCH] Cygwin: rewrite cmdline parser
@ 2020-06-24 22:35 Mingye Wang
  2020-06-25 14:43 ` [PATCH v2] " Mingye Wang
  0 siblings, 1 reply; 6+ messages in thread
From: Mingye Wang @ 2020-06-24 22:35 UTC (permalink / raw)
  To: cygwin-patches; +Cc: Mingye Wang

This commit rewrites the cmdline parser to achieve the following:
* MSVCRT compatibility. Except for the single-quote handling (an
  extension for compatibility with old Cygwin), the parser now
  interprets option boundaries exactly like MSVCR since 2008. This fixes
  the issue where our escaping does not work with our own parsing.
* Clarity. Since globify() is no longer responsible for handling the
  opening and closing of quotes, the code is much simpler.
* Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
  returns the literal value. Without the change, anything path-like
  would be garbled by globify's escaping.

The change fixes two complaints of mine:
* That cygwin is incompatible with its own escape.[1]
* That there is no way to echo `C:\"` from win32.[2]
  [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
  [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html

(It's never the point to spawn cygwin32 from cygwin64. Consistency
matters: with yourself always, and with the outside world when you are
supposed to.)
---
 winsup/cygwin/dcrt0.cc | 192 +++++++++++++++++++----------------------
 1 file changed, 88 insertions(+), 104 deletions(-)

diff --git a/winsup/cygwin/dcrt0.cc b/winsup/cygwin/dcrt0.cc
index 5d8b4b74e..71fdda294 100644
--- a/winsup/cygwin/dcrt0.cc
+++ b/winsup/cygwin/dcrt0.cc
@@ -151,43 +151,52 @@ isquote (char c)
   return ch == '"' || ch == '\'';
 }
 
-/* Step over a run of characters delimited by quotes */
+
+/* MSVCRT-like argument parsing.
+ * Parse a word, consuming characters and marking where quoting state is changed. */
 static /*__inline*/ char *
-quoted (char *cmd, int winshell)
+next_arg (char *cmd, char *arg, size_t* quotepos)
 {
-  char *p;
-  char quote = *cmd;
+  int inquote = 0;
+  int nbs = 0;
+  char *start = cmd;
+  char quote = '\0';
 
-  if (!winshell)
+  while (*cmd && (inquote || !issep(*cmd)))
     {
-      char *p;
-      strcpy (cmd, cmd + 1);
-      if (*(p = strchrnul (cmd, quote)))
-	strcpy (p, p + 1);
-      return p;
-    }
+      if (*cmd == '\\')
+	{
+	  nbs += 1;
+	  continue;
+	}
 
-  const char *s = quote == '\'' ? "'" : "\\\"";
-  /* This must have been run from a Windows shell, so preserve
-     quotes for globify to play with later. */
-  while (*cmd && *++cmd)
-    if ((p = strpbrk (cmd, s)) == NULL)
-      {
-	cmd = strchr (cmd, '\0');	// no closing quote
-	break;
-      }
-    else if (*p == '\\')
-      cmd = ++p;
-    else if (quote == '"' && p[1] == '"')
-      {
-	*p = '\\';
-	cmd = ++p;			// a quoted quote
-      }
-    else
-      {
-	cmd = p + 1;		// point to after end
-	break;
-      }
+      // For anything else, sort out backslashes first.
+      memset(arg, '\\', nbs / 2);
+      arg += nbs / 2;
+
+      // Single-quote is our addition.
+      if (nbs % 2 == 0 && (inquote ? *cmd == quote : isquote(*cmd)))
+	{
+	  // The infamous "" special case: emit literal '"', no change.
+	  if (inquote && *cmd == '"' && cmd[1] == '"')
+	    *arg++ = *cmd++;
+	  else
+	    {
+	      if (!inquote)
+		quote = *cmd;
+	      inquote = !inquote;
+	      *quotepos++ = cmd - start;
+	    }
+	}
+      else
+	{
+	  *arg++ = *cmd;
+	}
+
+      nbs = 0;
+      cmd++;
+    }
+  *arg = '\0';
   return cmd;
 }
 
@@ -202,67 +211,60 @@ quoted (char *cmd, int winshell)
 			    && isalpha ((s)[2]) \
 			    && strchr ((s) + 3, '\\')))
 
+/* Call glob(3) on the word, and fill argv accordingly.
+ * If the input looks like a DOS path, double up backslashes.
+ */
 static int __stdcall
-globify (char *word, char **&argv, int &argc, int &argvlen)
+globify (char *word, size_t *quotepos, char **&argv, int &argc, int &argvlen)
 {
   if (*word != '~' && strpbrk (word, "?*[\"\'(){}") == NULL)
     return 0;
 
-  int n = 0;
-  char *p, *s;
-  int dos_spec = is_dos_path (word);
-  if (!dos_spec && isquote (*word) && word[1] && word[2])
-    dos_spec = is_dos_path (word + 1);
-
   /* We'll need more space if there are quoting characters in
      word.  If that is the case, doubling the size of the
      string should provide more than enough space. */
-  if (strpbrk (word, "'\""))
-    n = strlen (word);
+  int n = quotepos[0] == SIZE_MAX ? 0 : strlen(word);
+  char *p, *s;
+  int dos_spec = is_dos_path (word);
   char pattern[strlen (word) + ((dos_spec + 1) * n) + 1];
+  int inquote = 0;
 
   /* Fill pattern with characters from word, quoting any
      characters found within quotes. */
   for (p = pattern, s = word; *s != '\000'; s++, p++)
-    if (!isquote (*s))
-      {
-	if (dos_spec && *s == '\\')
-	  *p++ = '\\';
-	*p = *s;
-      }
-    else
-      {
-	char quote = *s;
-	while (*++s && *s != quote)
-	  {
-	    if (dos_spec || *s != '\\')
-	      /* nothing */;
-	    else if (s[1] == quote || s[1] == '\\')
-	      s++;
+    {
+      if (*quotepos == s - word)
+	{
+	  inquote = !inquote;
+	  quotepos++;
+	}
+      if (!inquote)
+	{
+	  if (dos_spec && *s == '\\')
 	    *p++ = '\\';
-	    size_t cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
-	    if (cnt <= 1 || cnt == (size_t)-1)
-	      *p++ = *s;
-	    else
-	      {
-		--s;
-		while (cnt-- > 0)
-		  *p++ = *++s;
-	      }
-	  }
-	if (*s == quote)
-	  p--;
-	if (*s == '\0')
-	    break;
-      }
-
+	  *p = *s;
+	}
+      else
+	{
+	  *p++ = '\\';
+	  int cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
+	  if (cnt <= 1)
+	    *p = *s;
+	  else
+	    {
+	      memcpy(p, s, cnt);
+	      p += cnt - 1;
+	      s += cnt - 1;
+	    }
+	}
+    }
   *p = '\0';
 
   glob_t gl;
   gl.gl_offs = 0;
 
   /* Attempt to match the argument.  Return just word (minus quoting) if no match. */
-  if (glob (pattern, GLOB_TILDE | GLOB_NOCHECK | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
+  if (glob (pattern, GLOB_TILDE | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
     return 0;
 
   /* Allocate enough space in argv for the matched filenames. */
@@ -288,12 +290,14 @@ globify (char *word, char **&argv, int &argc, int &argvlen)
 }
 
 /* Build argv, argc from string passed from Windows.  */
-
 static void __stdcall
-build_argv (char *cmd, char **&argv, int &argc, int winshell)
+build_argv (char *cmd, char **&argv, int &argc, int doglob)
 {
   int argvlen = 0;
   int nesting = 0;		// monitor "nesting" from insert_file
+  char *word = (char *) malloc(strlen(cmd) + 1);
+  // Locations for quote transition. We can trim this down to 32767 later.
+  size_t *quotepos = (size_t *) malloc(strlen(cmd) * sizeof(size_t));
 
   argc = 0;
   argvlen = 0;
@@ -302,39 +306,17 @@ build_argv (char *cmd, char **&argv, int &argc, int winshell)
   /* Scan command line until there is nothing left. */
   while (*cmd)
     {
-      /* Ignore spaces */
-      if (issep (*cmd))
+      while (issep(*cmd))
 	{
 	  cmd++;
 	  continue;
 	}
-
-      /* Found the beginning of an argument. */
-      char *word = cmd;
-      char *sawquote = NULL;
-      while (*cmd)
-	{
-	  if (*cmd != '"' && (!winshell || *cmd != '\''))
-	    cmd++;		// Skip over this character
-	  else
-	    /* Skip over characters until the closing quote */
-	    {
-	      sawquote = cmd;
-	      /* Handle quoting.  Only strip off quotes if the parent is
-		 a Cygwin process, or if the word starts with a '@'.
-		 In this case, the insert_file function needs an unquoted
-		 DOS filename and globbing isn't performed anyway. */
-	      cmd = quoted (cmd, winshell && argc > 0 && *word != '@');
-	    }
-	  if (issep (*cmd))	// End of argument if space
-	    break;
-	}
-      if (*cmd)
-	*cmd++ = '\0';		// Terminate `word'
+      quotepos[0] = SIZE_MAX;
+      cmd = next_arg (cmd, word, quotepos);
 
       /* Possibly look for @file construction assuming that this isn't
 	 the very first argument and the @ wasn't quoted */
-      if (argc && sawquote != word && *word == '@')
+      if (argc && quotepos[0] > 1 && *word == '@')
 	{
 	  if (++nesting > MAX_AT_FILE_LEVEL)
 	    api_fatal ("Too many levels of nesting for %s", word);
@@ -350,16 +332,18 @@ build_argv (char *cmd, char **&argv, int &argc, int winshell)
 	}
 
       /* Add word to argv file after (optional) wildcard expansion. */
-      if (!winshell || !argc || !globify (word, argv, argc, argvlen))
+      if (!doglob || !argc || !globify (word, quotepos, argv, argc, argvlen))
 	{
 	  debug_printf ("argv[%d] = '%s'", argc, word);
-	  argv[argc++] = word;
+	  argv[argc++] = strdup(word);
 	}
     }
 
   if (argv)
     argv[argc] = NULL;
 
+  free(word);
+  free(quotepos);
   debug_printf ("argc %d", argc);
 }
 
@@ -1064,7 +1048,7 @@ _dll_crt0 ()
 	  if (stackaddr)
 	    {
 	      /* Set stack pointer to new address.  Set frame pointer to
-	         stack pointer and subtract 32 bytes for shadow space. */
+		 stack pointer and subtract 32 bytes for shadow space. */
 	      __asm__ ("\n\
 		       movq %[ADDR], %%rsp \n\
 		       movq  %%rsp, %%rbp  \n\
-- 
2.20.1.windows.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH v2] Cygwin: rewrite cmdline parser
  2020-06-24 22:35 [PATCH] Cygwin: rewrite cmdline parser Mingye Wang
@ 2020-06-25 14:43 ` Mingye Wang
  2020-06-25 14:49   ` Mingye Wang
  2020-06-30 10:30   ` Corinna Vinschen
  0 siblings, 2 replies; 6+ messages in thread
From: Mingye Wang @ 2020-06-25 14:43 UTC (permalink / raw)
  To: cygwin-patches; +Cc: Mingye Wang

This commit rewrites the cmdline parser to achieve the following:
* MSVCRT compatibility. Except for the single-quote handling (an
  extension for compatibility with old Cygwin), the parser now
  interprets option boundaries exactly like MSVCR since 2008. This fixes
  the issue where our escaping does not work with our own parsing.
* Clarity. Since globify() is no longer responsible for handling the
  opening and closing of quotes, the code is much simpler.
* Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
  returns the literal value. Without the change, anything path-like
  would be garbled by globify's escaping.

Some clarifications are made in the documentation for when globs are not
expanded.  A minor change was made to insert_file to remove the memory
leak with multiple files.

The change fixes two complaints of mine:
* That cygwin is incompatible with its own escape.[1]
* That there is no way to echo `C:\"` from win32.[2]
  [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
  [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html

(It's never the point to spawn cygwin32 from cygwin64. Consistency
matters: with yourself always, and with the outside world when you are
supposed to.)

---

This is the second version of the patch.  Changes include:
* Remove unnecessary allocations (do word in-place, realloc quotepos).
* Match MSVCRT backslash rules for unquoted.
* Make quotepos actually refer to out (smoke test).
* Pack up argument iteration a bit better.
* Edit documentation and insert_file.
---
 winsup/cygwin/dcrt0.cc   | 228 +++++++++++++++++++--------------------
 winsup/cygwin/winsup.h   |   4 +
 winsup/doc/cygwinenv.xml |   8 +-
 winsup/doc/faq-api.xml   |   2 +-
 4 files changed, 121 insertions(+), 121 deletions(-)

diff --git a/winsup/cygwin/dcrt0.cc b/winsup/cygwin/dcrt0.cc
index 5d8b4b74e..d16a24dd9 100644
--- a/winsup/cygwin/dcrt0.cc
+++ b/winsup/cygwin/dcrt0.cc
@@ -84,7 +84,7 @@ do_global_ctors (void (**in_pfunc)(), int force)
  * @foo and not the contents of foo.
  */
 static bool __stdcall
-insert_file (char *name, char *&cmd)
+insert_file (const char *name, char *&cmd, int free_old)
 {
   HANDLE f;
   DWORD size;
@@ -140,6 +140,8 @@ insert_file (char *name, char *&cmd)
 
   tmp[size++] = ' ';
   strcpy (tmp + size, cmd);
+  if (free_old)
+    free (cmd);
   cmd = tmp;
   return true;
 }
@@ -151,44 +153,68 @@ isquote (char c)
   return ch == '"' || ch == '\'';
 }
 
-/* Step over a run of characters delimited by quotes */
-static /*__inline*/ char *
-quoted (char *cmd, int winshell)
+/* MSVCRT-like argument parsing.
+ * Parse a word in-place, consuming characters and marking where quoting state is changed. */
+static /*__inline*/ bool
+next_arg (char *&cmd, char *&arg, size_t* quotepos, size_t &quotesize)
 {
-  char *p;
-  char quote = *cmd;
+  bool inquote = false;
+  size_t nbs = 0;
+  char quote = '\0';
+  quotepos[0] = SIZE_MAX;
+  size_t nquotes = 0;
+
+  while (*cmd && issep (*cmd))
+    cmd++;
 
-  if (!winshell)
+  arg = cmd;
+  char *out = arg;
+
+  for (;*cmd && (inquote || !issep (*cmd)); cmd++)
     {
-      char *p;
-      strcpy (cmd, cmd + 1);
-      if (*(p = strchrnul (cmd, quote)))
-	strcpy (p, p + 1);
-      return p;
+      if (*cmd == '\\')
+	{
+	  nbs += 1;
+	  continue;
+	}
+
+      // For anything else, sort out backslashes first.
+      memset (out, '\\', inquote ? nbs / 2 : nbs);
+      out += inquote ? nbs / 2 : nbs;
+
+      // Single-quote is our addition.  Would love to remove it.
+      if (nbs % 2 == 0 && (inquote ? *cmd == quote : isquote (*cmd)))
+	{
+	  /* The infamous "" special case: emit literal '"', no change.
+	   *
+	   * Makes quotepos tracking easier, so applies to single quote too:
+	   * without this handling, an out pos can contain many state changes,
+	   * so a check must be done before appending. */
+	  if (inquote && *cmd == quote && cmd[1] == quote)
+	    *out++ = *cmd++;
+	  else
+	    {
+	      if (!inquote)
+		quote = *cmd;
+	      if (++nquotes >= quotesize)
+		quotepos = realloc_type(quotepos, quotesize *= 2, size_t);
+	      quotepos[nquotes] = out - arg + inquote;
+	      inquote = !inquote;
+	    }
+	}
+      else
+	{
+	  *out++ = *cmd;
+	}
+
+      nbs = 0;
     }
 
-  const char *s = quote == '\'' ? "'" : "\\\"";
-  /* This must have been run from a Windows shell, so preserve
-     quotes for globify to play with later. */
-  while (*cmd && *++cmd)
-    if ((p = strpbrk (cmd, s)) == NULL)
-      {
-	cmd = strchr (cmd, '\0');	// no closing quote
-	break;
-      }
-    else if (*p == '\\')
-      cmd = ++p;
-    else if (quote == '"' && p[1] == '"')
-      {
-	*p = '\\';
-	cmd = ++p;			// a quoted quote
-      }
-    else
-      {
-	cmd = p + 1;		// point to after end
-	break;
-      }
-  return cmd;
+  if (*cmd)
+    cmd++;
+
+  *out = '\0';
+  return arg != cmd;
 }
 
 /* Perform a glob on word if it contains wildcard characters.
@@ -202,67 +228,62 @@ quoted (char *cmd, int winshell)
 			    && isalpha ((s)[2]) \
 			    && strchr ((s) + 3, '\\')))
 
+/* Call glob(3) on the word, and fill argv accordingly.
+ * If the input looks like a DOS path, double up backslashes.
+ */
 static int __stdcall
-globify (char *word, char **&argv, int &argc, int &argvlen)
+globify (const char *word, size_t *quotepos, size_t quotesize, char **&argv, int &argc, int &argvlen)
 {
   if (*word != '~' && strpbrk (word, "?*[\"\'(){}") == NULL)
     return 0;
 
-  int n = 0;
-  char *p, *s;
-  int dos_spec = is_dos_path (word);
-  if (!dos_spec && isquote (*word) && word[1] && word[2])
-    dos_spec = is_dos_path (word + 1);
-
   /* We'll need more space if there are quoting characters in
      word.  If that is the case, doubling the size of the
      string should provide more than enough space. */
-  if (strpbrk (word, "'\""))
-    n = strlen (word);
+  size_t n = quotepos[0] == SIZE_MAX ? 0 : strlen (word);
+  char *p;
+  const char *s;
+  int dos_spec = is_dos_path (word);
   char pattern[strlen (word) + ((dos_spec + 1) * n) + 1];
+  bool inquote = false;
+  size_t nquotes = 0;
 
   /* Fill pattern with characters from word, quoting any
      characters found within quotes. */
   for (p = pattern, s = word; *s != '\000'; s++, p++)
-    if (!isquote (*s))
-      {
-	if (dos_spec && *s == '\\')
-	  *p++ = '\\';
-	*p = *s;
-      }
-    else
-      {
-	char quote = *s;
-	while (*++s && *s != quote)
-	  {
-	    if (dos_spec || *s != '\\')
-	      /* nothing */;
-	    else if (s[1] == quote || s[1] == '\\')
-	      s++;
+    {
+      if (nquotes < quotesize && quotepos[nquotes] == s - word)
+	{
+	  inquote = !inquote;
+	  nquotes++;
+	}
+      if (!inquote)
+	{
+	  if (dos_spec && *s == '\\')
 	    *p++ = '\\';
-	    size_t cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
-	    if (cnt <= 1 || cnt == (size_t)-1)
-	      *p++ = *s;
-	    else
-	      {
-		--s;
-		while (cnt-- > 0)
-		  *p++ = *++s;
-	      }
-	  }
-	if (*s == quote)
-	  p--;
-	if (*s == '\0')
-	    break;
-      }
-
+	  *p = *s;
+	}
+      else
+	{
+	  *p++ = '\\';
+	  int cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
+	  if (cnt <= 1)
+	    *p = *s;
+	  else
+	    {
+	      memcpy (p, s, cnt);
+	      p += cnt - 1;
+	      s += cnt - 1;
+	    }
+	}
+    }
   *p = '\0';
 
   glob_t gl;
   gl.gl_offs = 0;
 
-  /* Attempt to match the argument.  Return just word (minus quoting) if no match. */
-  if (glob (pattern, GLOB_TILDE | GLOB_NOCHECK | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
+  /* Attempt to match the argument.  Bail if no match. */
+  if (glob (pattern, GLOB_TILDE | GLOB_BRACE, NULL, &gl) || !gl.gl_pathc)
     return 0;
 
   /* Allocate enough space in argv for the matched filenames. */
@@ -278,7 +299,7 @@ globify (char *word, char **&argv, int &argc, int &argvlen)
   char **av = argv + n;
   while (*gv)
     {
-      debug_printf ("argv[%d] = '%s'", n++, *gv);
+      debug_printf ("argv[%zu] = '%s'", n++, *gv);
       *av++ = *gv++;
     }
 
@@ -288,58 +309,31 @@ globify (char *word, char **&argv, int &argc, int &argvlen)
 }
 
 /* Build argv, argc from string passed from Windows.  */
-
 static void __stdcall
-build_argv (char *cmd, char **&argv, int &argc, int winshell)
+build_argv (char *cmd, char **&argv, int &argc, int doglob)
 {
   int argvlen = 0;
   int nesting = 0;		// monitor "nesting" from insert_file
 
+  // Would be a bad idea to use alloca due to insert_file.
+  size_t quotesize = 32;
+  size_t *quotepos = malloc_type(quotesize, size_t);
+
   argc = 0;
   argvlen = 0;
   argv = NULL;
 
   /* Scan command line until there is nothing left. */
-  while (*cmd)
+  while (next_arg (cmd, word, quotepos, quotesize))
     {
-      /* Ignore spaces */
-      if (issep (*cmd))
-	{
-	  cmd++;
-	  continue;
-	}
-
-      /* Found the beginning of an argument. */
-      char *word = cmd;
-      char *sawquote = NULL;
-      while (*cmd)
-	{
-	  if (*cmd != '"' && (!winshell || *cmd != '\''))
-	    cmd++;		// Skip over this character
-	  else
-	    /* Skip over characters until the closing quote */
-	    {
-	      sawquote = cmd;
-	      /* Handle quoting.  Only strip off quotes if the parent is
-		 a Cygwin process, or if the word starts with a '@'.
-		 In this case, the insert_file function needs an unquoted
-		 DOS filename and globbing isn't performed anyway. */
-	      cmd = quoted (cmd, winshell && argc > 0 && *word != '@');
-	    }
-	  if (issep (*cmd))	// End of argument if space
-	    break;
-	}
-      if (*cmd)
-	*cmd++ = '\0';		// Terminate `word'
-
       /* Possibly look for @file construction assuming that this isn't
 	 the very first argument and the @ wasn't quoted */
-      if (argc && sawquote != word && *word == '@')
+      if (argc && quotepos[0] > 1 && *word == '@')
 	{
 	  if (++nesting > MAX_AT_FILE_LEVEL)
 	    api_fatal ("Too many levels of nesting for %s", word);
-	  if (insert_file (word, cmd))
-	      continue;			// There's new stuff in cmd now
+	  if (insert_file (word, cmd, nesting - 1))
+	    continue;  // There's new stuff in cmd now
 	}
 
       /* See if we need to allocate more space for argv */
@@ -350,16 +344,18 @@ build_argv (char *cmd, char **&argv, int &argc, int winshell)
 	}
 
       /* Add word to argv file after (optional) wildcard expansion. */
-      if (!winshell || !argc || !globify (word, argv, argc, argvlen))
+      if (!doglob || !argc || !globify (word, quotepos, quotesize, argv, argc, argvlen))
 	{
 	  debug_printf ("argv[%d] = '%s'", argc, word);
-	  argv[argc++] = word;
+	  argv[argc++] = strdup (word);
 	}
     }
 
   if (argv)
     argv[argc] = NULL;
 
+  free (word);
+  free (quotepos);
   debug_printf ("argc %d", argc);
 }
 
@@ -1064,7 +1060,7 @@ _dll_crt0 ()
 	  if (stackaddr)
 	    {
 	      /* Set stack pointer to new address.  Set frame pointer to
-	         stack pointer and subtract 32 bytes for shadow space. */
+		 stack pointer and subtract 32 bytes for shadow space. */
 	      __asm__ ("\n\
 		       movq %[ADDR], %%rsp \n\
 		       movq  %%rsp, %%rbp  \n\
diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h
index fff7d18f3..f09d6d89e 100644
--- a/winsup/cygwin/winsup.h
+++ b/winsup/cygwin/winsup.h
@@ -151,6 +151,10 @@ extern int cygserver_running;
 #define isabspath(p) \
   (isdirsep (*(p)) || (isalpha (*(p)) && (p)[1] == ':' && (!(p)[2] || isdirsep ((p)[2]))))
 
+/* Shortcut.  See also std::add_pointer. */
+#define malloc_type(n, type) ((type *) malloc ((n) * sizeof (type)))
+#define realloc_type(b, n, type) ((type *) realloc (b, (n) * sizeof (type)))
+
 /******************** Initialization/Termination **********************/
 
 class per_process;
diff --git a/winsup/doc/cygwinenv.xml b/winsup/doc/cygwinenv.xml
index f549fee0d..54ee93443 100644
--- a/winsup/doc/cygwinenv.xml
+++ b/winsup/doc/cygwinenv.xml
@@ -34,10 +34,10 @@ There is no default set.
 </listitem>
 
 <listitem>
-<para><envar>(no)glob[:ignorecase]</envar> - if set, command line arguments
-containing UNIX-style file wildcard characters (brackets, braces, question mark,
-asterisk, escaped with \) are expanded into lists of files that match 
-those wildcards.
+<para><envar>(no)glob[:ignorecase]</envar> - if set, unquoted
+command line arguments containing UNIX-style file wildcard characters (brackets,
+braces, question mark, asterisk, escaped with \) are expanded into lists of
+files that match those wildcards.  Leading tildes are expanded.
 This is applicable only to programs run from non-Cygwin programs such as a CMD prompt.
 That means that this setting does not affect globbing operations for shells such as
 bash, sh, tcsh, zsh, etc.
diff --git a/winsup/doc/faq-api.xml b/winsup/doc/faq-api.xml
index 313f15d37..f09dd0b11 100644
--- a/winsup/doc/faq-api.xml
+++ b/winsup/doc/faq-api.xml
@@ -169,7 +169,7 @@ for the old executable and any dll into per-user subdirectories in the
 <para>If the DLL thinks it was invoked from a DOS style prompt, it runs a
 `globber' over the arguments provided on the command line.  This means
 that if you type <literal>LS *.EXE</literal> from DOS, it will do what you might
-expect.
+expect.  This only happens to the unquoted parts.
 </para>
 <para>Beware: globbing uses <literal>malloc</literal>.  If your application defines
 <literal>malloc</literal>, that will get used.  This may do horrible things to you.
-- 
2.20.1.windows.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] Cygwin: rewrite cmdline parser
  2020-06-25 14:43 ` [PATCH v2] " Mingye Wang
@ 2020-06-25 14:49   ` Mingye Wang
  2020-06-30 10:30   ` Corinna Vinschen
  1 sibling, 0 replies; 6+ messages in thread
From: Mingye Wang @ 2020-06-25 14:49 UTC (permalink / raw)
  To: cygwin-patches

On 2020/6/25 22:43, Mingye Wang wrote:
> + free (word);

Grrr, this should not be in there. Waiting for reviews and saving that 
up for v3.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] Cygwin: rewrite cmdline parser
  2020-06-25 14:43 ` [PATCH v2] " Mingye Wang
  2020-06-25 14:49   ` Mingye Wang
@ 2020-06-30 10:30   ` Corinna Vinschen
  1 sibling, 0 replies; 6+ messages in thread
From: Corinna Vinschen @ 2020-06-30 10:30 UTC (permalink / raw)
  To: Mingye Wang; +Cc: cygwin-patches

Hi Mingye,

On Jun 25 22:43, Mingye Wang wrote:
> This commit rewrites the cmdline parser to achieve the following:
> * MSVCRT compatibility. Except for the single-quote handling (an
>   extension for compatibility with old Cygwin), the parser now
>   interprets option boundaries exactly like MSVCR since 2008. This fixes
>   the issue where our escaping does not work with our own parsing.
> * Clarity. Since globify() is no longer responsible for handling the
>   opening and closing of quotes, the code is much simpler.
> * Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
>   returns the literal value. Without the change, anything path-like
>   would be garbled by globify's escaping.
> 
> Some clarifications are made in the documentation for when globs are not
> expanded.  A minor change was made to insert_file to remove the memory
> leak with multiple files.
> 
> The change fixes two complaints of mine:
> * That cygwin is incompatible with its own escape.[1]
> * That there is no way to echo `C:\"` from win32.[2]
>   [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
>   [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html
> 
> (It's never the point to spawn cygwin32 from cygwin64. Consistency
> matters: with yourself always, and with the outside world when you are
> supposed to.)

Apart from the small free() problem, you mention in your reply to self,
this patch looks great at first glance.

Three questions/requests if you don't mind:

- Would you mind to send the corrected version yet?

- A contribution like this still(*) requires the 2-clause BSD waiver per
  the winsup/CONTRIBUTORS file, see the chapter "Before you get started"
  on https://cygwin.com/contrib.html

- Can you please take a bit of time and try to outline in how far this
  change introduces backward compatibility problems with the old code?
  I don't mean the obvious bug like the backslash problem, but rather
  the question is, what input did something useful before which doesn't
  work the same way now?  I'd like to get a feeling how much this may
  affect existing scripts.


Thanks,
Corinna


(*) IIRC, 2020 is the last year requiring this...

-- 
Corinna Vinschen
Cygwin Maintainer

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] Cygwin: rewrite cmdline parser
  2020-11-07 12:12   ` [PATCH] " Mingye Wang
@ 2020-11-09 10:04     ` Corinna Vinschen
  0 siblings, 0 replies; 6+ messages in thread
From: Corinna Vinschen @ 2020-11-09 10:04 UTC (permalink / raw)
  To: Mingye Wang; +Cc: cygwin-patches

Hi Mingye,

On Nov  7 20:12, Mingye Wang wrote:
> This commit rewrites the cmdline parser to achieve the following:
> * MSVCRT compatibility. Except for the single-quote handling (an
>   extension for compatibility with old Cygwin), the parser now
>   interprets option boundaries exactly like MSVCR since 2008. This fixes
>   the issue where our escaping does not work with our own parsing.
> * Clarity. Since globify() is no longer responsible for handling the
>   opening and closing of quotes, the code is much simpler.
> * Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
>   returns the literal value. Without the change, anything path-like
>   would be garbled by globify's escaping.
> * A memory leak in the @file expansion is removed by rewriting it to use
>   a stack of buffers. This also simplifies the code since we no longer
>   have to move stuff. The "downside" is that tokens can no longer cross
>   file boundaries.
> 
> Some clarifications are made in the documentation for when globs are not
> expanded.
> 
> The change fixes two complaints of mine:
> * That cygwin is incompatible with its own escape.[1]
> * That there is no way to echo `C:\"` from win32.[2]
>   [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
>   [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html
> 
> (It's never the point to spawn cygwin32 from cygwin64. Consistency
> matters: with yourself always, and with the outside world when you are
> supposed to.)

This looks already pretty good now, but there's a problem when building:

  winsup/cygwin/winf.cc:67:1: error: no declaration matches ‘bool linebuf::fromargv(av&, const char*, bool, bool)’
     67 | linebuf::fromargv (av& newargv, const char *real_path, bool trunc_for_cygwin, bool forcequote)
	| ^~~~~~~
  In file included from winsup/cygwin/winf.cc:16:
  winsup/cygwin/winf.h:76:15: note: candidate is: ‘bool linebuf::fromargv(av&, const char*, bool)’
     76 |   bool __reg3 fromargv(av&, const char *, bool);;
	|               ^~~~~~~~
  winsup/cygwin/winf.h:64:7: note: ‘class linebuf’ defined here
     64 | class linebuf
	|       ^~~~~~~

The declaration in winf.h is actually missing the "forcequote" arg.  Is
"forcequote" supposed to be a default parameter?  If so, defaulting to
true or false? 

However, given that *both* calls to fromargv() don't set forcequote at
all, it will be the same value for all callers and thus it's entirely
redundant.  So why not just drop it?

Also, this change

> +#if defined (__x86_64__) || defined (__CYGMAGIC__) || !defined (__GNUC__)

is entirely unrelated and should go into it's own patch explaining
why it's necessary.  After all, we're building Cygwin with gcc only...


Thanks,
Corinna

^ permalink raw reply	[flat|nested] 6+ messages in thread

* [PATCH] Cygwin: rewrite cmdline parser
  2020-11-07 12:12 ` [PATCH v5] Cygwin: rewrite cmdline parser Mingye Wang
@ 2020-11-07 12:12   ` Mingye Wang
  2020-11-09 10:04     ` Corinna Vinschen
  0 siblings, 1 reply; 6+ messages in thread
From: Mingye Wang @ 2020-11-07 12:12 UTC (permalink / raw)
  To: cygwin-patches; +Cc: Mingye Wang

This commit rewrites the cmdline parser to achieve the following:
* MSVCRT compatibility. Except for the single-quote handling (an
  extension for compatibility with old Cygwin), the parser now
  interprets option boundaries exactly like MSVCR since 2008. This fixes
  the issue where our escaping does not work with our own parsing.
* Clarity. Since globify() is no longer responsible for handling the
  opening and closing of quotes, the code is much simpler.
* Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
  returns the literal value. Without the change, anything path-like
  would be garbled by globify's escaping.
* A memory leak in the @file expansion is removed by rewriting it to use
  a stack of buffers. This also simplifies the code since we no longer
  have to move stuff. The "downside" is that tokens can no longer cross
  file boundaries.

Some clarifications are made in the documentation for when globs are not
expanded.

The change fixes two complaints of mine:
* That cygwin is incompatible with its own escape.[1]
* That there is no way to echo `C:\"` from win32.[2]
  [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
  [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html

(It's never the point to spawn cygwin32 from cygwin64. Consistency
matters: with yourself always, and with the outside world when you are
supposed to.)

This is the fifth version of the patch, applying reviewer suggestions to
keep the interface private. I provide all my patches to Cygwin,
including this one and all future ones, under the 2-clause BSD license.
---
 winsup/cygwin/dcrt0.cc   | 299 +--------------------------------
 winsup/cygwin/regparm.h  |   2 +-
 winsup/cygwin/winf.cc    | 347 ++++++++++++++++++++++++++++++++++++++-
 winsup/cygwin/winf.h     |   2 +
 winsup/cygwin/winsup.h   |   7 +-
 winsup/doc/cygwinenv.xml |   8 +-
 winsup/doc/faq-api.xml   |   2 +-
 7 files changed, 363 insertions(+), 304 deletions(-)

diff --git a/winsup/cygwin/dcrt0.cc b/winsup/cygwin/dcrt0.cc
index 5d8b4b74e..dee7496f7 100644
--- a/winsup/cygwin/dcrt0.cc
+++ b/winsup/cygwin/dcrt0.cc
@@ -10,7 +10,6 @@ details. */
 #include "miscfuncs.h"
 #include <unistd.h>
 #include <stdlib.h>
-#include "glob.h"
 #include <ctype.h>
 #include <locale.h>
 #include <sys/param.h>
@@ -35,6 +34,7 @@ details. */
 #include "cygxdr.h"
 #include "fenv.h"
 #include "ntdll.h"
+#include "winf.h"
 
 #define MAX_AT_FILE_LEVEL 10
 
@@ -77,292 +77,6 @@ do_global_ctors (void (**in_pfunc)(), int force)
     (*pfunc) ();
 }
 
-/*
- * Replaces @file in the command line with the contents of the file.
- * There may be multiple @file's in a single command line
- * A \@file is replaced with @file so that echo \@foo would print
- * @foo and not the contents of foo.
- */
-static bool __stdcall
-insert_file (char *name, char *&cmd)
-{
-  HANDLE f;
-  DWORD size;
-  tmp_pathbuf tp;
-
-  PWCHAR wname = tp.w_get ();
-  sys_mbstowcs (wname, NT_MAX_PATH, name + 1);
-  f = CreateFileW (wname,
-		   GENERIC_READ,		/* open for reading	*/
-		   FILE_SHARE_VALID_FLAGS,      /* share for reading	*/
-		   &sec_none_nih,		/* default security	*/
-		   OPEN_EXISTING,		/* existing file only	*/
-		   FILE_ATTRIBUTE_NORMAL,	/* normal file		*/
-		   NULL);			/* no attr. template	*/
-
-  if (f == INVALID_HANDLE_VALUE)
-    {
-      debug_printf ("couldn't open file '%s', %E", name);
-      return false;
-    }
-
-  /* This only supports files up to about 4 billion bytes in
-     size.  I am making the bold assumption that this is big
-     enough for this feature */
-  size = GetFileSize (f, NULL);
-  if (size == 0xFFFFFFFF)
-    {
-      CloseHandle (f);
-      debug_printf ("couldn't get file size for '%s', %E", name);
-      return false;
-    }
-
-  int new_size = strlen (cmd) + size + 2;
-  char *tmp = (char *) malloc (new_size);
-  if (!tmp)
-    {
-      CloseHandle (f);
-      debug_printf ("malloc failed, %E");
-      return false;
-    }
-
-  /* realloc passed as it should */
-  DWORD rf_read;
-  BOOL rf_result;
-  rf_result = ReadFile (f, tmp, size, &rf_read, NULL);
-  CloseHandle (f);
-  if (!rf_result || (rf_read != size))
-    {
-      free (tmp);
-      debug_printf ("ReadFile failed, %E");
-      return false;
-    }
-
-  tmp[size++] = ' ';
-  strcpy (tmp + size, cmd);
-  cmd = tmp;
-  return true;
-}
-
-static inline int
-isquote (char c)
-{
-  char ch = c;
-  return ch == '"' || ch == '\'';
-}
-
-/* Step over a run of characters delimited by quotes */
-static /*__inline*/ char *
-quoted (char *cmd, int winshell)
-{
-  char *p;
-  char quote = *cmd;
-
-  if (!winshell)
-    {
-      char *p;
-      strcpy (cmd, cmd + 1);
-      if (*(p = strchrnul (cmd, quote)))
-	strcpy (p, p + 1);
-      return p;
-    }
-
-  const char *s = quote == '\'' ? "'" : "\\\"";
-  /* This must have been run from a Windows shell, so preserve
-     quotes for globify to play with later. */
-  while (*cmd && *++cmd)
-    if ((p = strpbrk (cmd, s)) == NULL)
-      {
-	cmd = strchr (cmd, '\0');	// no closing quote
-	break;
-      }
-    else if (*p == '\\')
-      cmd = ++p;
-    else if (quote == '"' && p[1] == '"')
-      {
-	*p = '\\';
-	cmd = ++p;			// a quoted quote
-      }
-    else
-      {
-	cmd = p + 1;		// point to after end
-	break;
-      }
-  return cmd;
-}
-
-/* Perform a glob on word if it contains wildcard characters.
-   Also quote every character between quotes to force glob to
-   treat the characters literally. */
-
-/* Either X:[...] or \\server\[...] */
-#define is_dos_path(s) (isdrive(s) \
-			|| ((s)[0] == '\\' \
-			    && (s)[1] == '\\' \
-			    && isalpha ((s)[2]) \
-			    && strchr ((s) + 3, '\\')))
-
-static int __stdcall
-globify (char *word, char **&argv, int &argc, int &argvlen)
-{
-  if (*word != '~' && strpbrk (word, "?*[\"\'(){}") == NULL)
-    return 0;
-
-  int n = 0;
-  char *p, *s;
-  int dos_spec = is_dos_path (word);
-  if (!dos_spec && isquote (*word) && word[1] && word[2])
-    dos_spec = is_dos_path (word + 1);
-
-  /* We'll need more space if there are quoting characters in
-     word.  If that is the case, doubling the size of the
-     string should provide more than enough space. */
-  if (strpbrk (word, "'\""))
-    n = strlen (word);
-  char pattern[strlen (word) + ((dos_spec + 1) * n) + 1];
-
-  /* Fill pattern with characters from word, quoting any
-     characters found within quotes. */
-  for (p = pattern, s = word; *s != '\000'; s++, p++)
-    if (!isquote (*s))
-      {
-	if (dos_spec && *s == '\\')
-	  *p++ = '\\';
-	*p = *s;
-      }
-    else
-      {
-	char quote = *s;
-	while (*++s && *s != quote)
-	  {
-	    if (dos_spec || *s != '\\')
-	      /* nothing */;
-	    else if (s[1] == quote || s[1] == '\\')
-	      s++;
-	    *p++ = '\\';
-	    size_t cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
-	    if (cnt <= 1 || cnt == (size_t)-1)
-	      *p++ = *s;
-	    else
-	      {
-		--s;
-		while (cnt-- > 0)
-		  *p++ = *++s;
-	      }
-	  }
-	if (*s == quote)
-	  p--;
-	if (*s == '\0')
-	    break;
-      }
-
-  *p = '\0';
-
-  glob_t gl;
-  gl.gl_offs = 0;
-
-  /* Attempt to match the argument.  Return just word (minus quoting) if no match. */
-  if (glob (pattern, GLOB_TILDE | GLOB_NOCHECK | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
-    return 0;
-
-  /* Allocate enough space in argv for the matched filenames. */
-  n = argc;
-  if ((argc += gl.gl_pathc) > argvlen)
-    {
-      argvlen = argc + 10;
-      argv = (char **) realloc (argv, (1 + argvlen) * sizeof (argv[0]));
-    }
-
-  /* Copy the matched filenames to argv. */
-  char **gv = gl.gl_pathv;
-  char **av = argv + n;
-  while (*gv)
-    {
-      debug_printf ("argv[%d] = '%s'", n++, *gv);
-      *av++ = *gv++;
-    }
-
-  /* Clean up after glob. */
-  free (gl.gl_pathv);
-  return 1;
-}
-
-/* Build argv, argc from string passed from Windows.  */
-
-static void __stdcall
-build_argv (char *cmd, char **&argv, int &argc, int winshell)
-{
-  int argvlen = 0;
-  int nesting = 0;		// monitor "nesting" from insert_file
-
-  argc = 0;
-  argvlen = 0;
-  argv = NULL;
-
-  /* Scan command line until there is nothing left. */
-  while (*cmd)
-    {
-      /* Ignore spaces */
-      if (issep (*cmd))
-	{
-	  cmd++;
-	  continue;
-	}
-
-      /* Found the beginning of an argument. */
-      char *word = cmd;
-      char *sawquote = NULL;
-      while (*cmd)
-	{
-	  if (*cmd != '"' && (!winshell || *cmd != '\''))
-	    cmd++;		// Skip over this character
-	  else
-	    /* Skip over characters until the closing quote */
-	    {
-	      sawquote = cmd;
-	      /* Handle quoting.  Only strip off quotes if the parent is
-		 a Cygwin process, or if the word starts with a '@'.
-		 In this case, the insert_file function needs an unquoted
-		 DOS filename and globbing isn't performed anyway. */
-	      cmd = quoted (cmd, winshell && argc > 0 && *word != '@');
-	    }
-	  if (issep (*cmd))	// End of argument if space
-	    break;
-	}
-      if (*cmd)
-	*cmd++ = '\0';		// Terminate `word'
-
-      /* Possibly look for @file construction assuming that this isn't
-	 the very first argument and the @ wasn't quoted */
-      if (argc && sawquote != word && *word == '@')
-	{
-	  if (++nesting > MAX_AT_FILE_LEVEL)
-	    api_fatal ("Too many levels of nesting for %s", word);
-	  if (insert_file (word, cmd))
-	      continue;			// There's new stuff in cmd now
-	}
-
-      /* See if we need to allocate more space for argv */
-      if (argc >= argvlen)
-	{
-	  argvlen = argc + 10;
-	  argv = (char **) realloc (argv, (1 + argvlen) * sizeof (argv[0]));
-	}
-
-      /* Add word to argv file after (optional) wildcard expansion. */
-      if (!winshell || !argc || !globify (word, argv, argc, argvlen))
-	{
-	  debug_printf ("argv[%d] = '%s'", argc, word);
-	  argv[argc++] = word;
-	}
-    }
-
-  if (argv)
-    argv[argc] = NULL;
-
-  debug_printf ("argc %d", argc);
-}
-
 /* sanity and sync check */
 void __stdcall
 check_sanity_and_sync (per_process *p)
@@ -939,8 +653,13 @@ dll_crt0_1 (void *)
 
       /* Scan the command line and build argv.  Expand wildcards if not
 	 called from another cygwin process. */
-      build_argv (line, __argv, __argc,
-		  NOTSTATE (myself, PID_CYGPARENT) && allow_glob);
+      __argc = cygwin_cmdline_parse (line, &__argv, NULL,
+		NOTSTATE (myself, PID_CYGPARENT) && allow_glob, MAX_AT_FILE_LEVEL);
+
+      if (__argc == -EMLINK)
+	api_fatal ("Too many levels of nesting for %s", line);
+      else if (__argc < 0)
+	api_fatal ("%s for parsing cmdline %s", strerror(-__argc), line);
 
       /* Convert argv[0] to posix rules if it's currently blatantly
 	 win32 style. */
@@ -1064,7 +783,7 @@ _dll_crt0 ()
 	  if (stackaddr)
 	    {
 	      /* Set stack pointer to new address.  Set frame pointer to
-	         stack pointer and subtract 32 bytes for shadow space. */
+		 stack pointer and subtract 32 bytes for shadow space. */
 	      __asm__ ("\n\
 		       movq %[ADDR], %%rsp \n\
 		       movq  %%rsp, %%rbp  \n\
diff --git a/winsup/cygwin/regparm.h b/winsup/cygwin/regparm.h
index cce1bab4a..a14c501db 100644
--- a/winsup/cygwin/regparm.h
+++ b/winsup/cygwin/regparm.h
@@ -8,7 +8,7 @@ details. */
 
 #pragma once
 
-#if defined (__x86_64__) || defined (__CYGMAGIC__)
+#if defined (__x86_64__) || defined (__CYGMAGIC__) || !defined (__GNUC__)
 # define __reg1
 # define __reg2
 # define __reg3
diff --git a/winsup/cygwin/winf.cc b/winsup/cygwin/winf.cc
index d0c5c440f..b2926c8e6 100644
--- a/winsup/cygwin/winf.cc
+++ b/winsup/cygwin/winf.cc
@@ -15,15 +15,16 @@ details. */
 #include "tls_pbuf.h"
 #include "winf.h"
 #include "sys/cygwin.h"
+#include "glob.h"
 
 void
-linebuf::finish (bool cmdlenoverflow_ok)
+linebuf::finish (bool trunc_for_cygwin)
 {
   if (!ix)
     add ("", 1);
   else
     {
-      if (ix-- > MAXCYGWINCMDLEN && cmdlenoverflow_ok)
+      if (ix-- > MAXCYGWINCMDLEN && trunc_for_cygwin)
 	ix = MAXCYGWINCMDLEN - 1;
       buf[ix] = '\0';
     }
@@ -63,7 +64,7 @@ linebuf::prepend (const char *what, int len)
 }
 
 bool
-linebuf::fromargv (av& newargv, const char *real_path, bool cmdlenoverflow_ok)
+linebuf::fromargv (av& newargv, const char *real_path, bool trunc_for_cygwin, bool forcequote)
 {
   bool success = true;
   for (int i = 0; i < newargv.argc; i++)
@@ -73,7 +74,7 @@ linebuf::fromargv (av& newargv, const char *real_path, bool cmdlenoverflow_ok)
 
       a = i ? newargv[i] : (char *) real_path;
       int len = strlen (a);
-      if (len != 0 && !strpbrk (a, " \t\n\r\""))
+      if (len != 0 && !forcequote && !strpbrk (a, " \t\n\r\""))
 	add (a, len);
       else
 	{
@@ -111,7 +112,7 @@ linebuf::fromargv (av& newargv, const char *real_path, bool cmdlenoverflow_ok)
       add (" ", 1);
     }
 
-  finish (cmdlenoverflow_ok);
+  finish (trunc_for_cygwin);
 
   if (ix >= MAXWINCMDLEN)
     {
@@ -138,3 +139,339 @@ av::unshift (const char *what)
   argc++;
   return 1;
 }
+
+/*
+ * Read a file into a newly-allocated buffer.
+ */
+static char* __reg1
+read_file (const char *name)
+{
+  HANDLE f;
+  DWORD size;
+  tmp_pathbuf tp;
+
+  PWCHAR wname = tp.w_get ();
+  sys_mbstowcs (wname, NT_MAX_PATH, name);	/* FIXME: Do we need \\?\ ? */
+  f = CreateFileW (wname,
+		   GENERIC_READ,		/* open for reading	*/
+		   FILE_SHARE_VALID_FLAGS,      /* share for reading	*/
+		   &sec_none_nih,		/* default security	*/
+		   OPEN_EXISTING,		/* existing file only	*/
+		   FILE_ATTRIBUTE_NORMAL,	/* normal file		*/
+		   NULL);			/* no attr. template	*/
+
+  if (f == INVALID_HANDLE_VALUE)
+    {
+      debug_printf ("couldn't open file '%s', %E", name);
+      return NULL;
+    }
+
+  /* This only supports files up to about 4 billion bytes in
+     size.  I am making the bold assumption that this is big
+     enough for this feature */
+  size = GetFileSize (f, NULL);
+  if (size == 0xFFFFFFFF)
+    {
+      debug_printf ("couldn't get file size for '%s', %E", name);
+      CloseHandle (f);
+      return NULL;
+    }
+
+  char *string = (char *) malloc (size + 1);
+  if (!string)
+    {
+      CloseHandle (f);
+      return NULL;
+    }
+
+  /* malloc passed as it should */
+  DWORD rf_read;
+  BOOL rf_result;
+  rf_result = ReadFile (f, string, size, &rf_read, NULL);
+  if (!rf_result || (rf_read != size))
+    {
+      CloseHandle (f);
+      return NULL;
+    }
+  string[size] = '\0';
+  return string;
+}
+
+static inline int
+isquote (char c)
+{
+  char ch = c;
+  return ch == '"' || ch == '\'';
+}
+
+static inline int
+issep (char c)
+{
+  return c && (strchr (" \t\n\r", c) != NULL);
+}
+
+/* MSVCRT-like argument parsing.
+ * Parse a word in-place, consuming characters and marking where quoting state is changed. */
+static bool __reg3
+next_arg (char *&cmd, char *&arg, size_t* quotepos, size_t &quotesize)
+{
+  bool inquote = false;
+  size_t nbs = 0;
+  char quote = '\0';
+  size_t nquotes = 0;
+
+  while (*cmd && issep (*cmd))
+    cmd++;
+
+  arg = cmd;
+  char *out = arg;
+
+  for (;*cmd && (inquote || !issep (*cmd)); cmd++)
+    {
+      if (*cmd == '\\')
+	{
+	  nbs += 1;
+	  continue;
+	}
+
+      // For anything else, sort out backslashes first.
+      // All backslashes are literal, except these before a quote.
+      // Single-quote is our addition.  Would love to remove it.
+      bool atquote = inquote ? *cmd == quote : isquote (*cmd);
+      size_t n = atquote ? nbs / 2 : nbs;
+      memset (out, '\\', n);
+      out += n;
+
+      if (nbs % 2 == 0 && atquote)
+	{
+	  /* The infamous "" special case: emit literal '"', no change.
+	   *
+	   * Makes quotepos tracking easier, so applies to single quote too:
+	   * without this handling, an out pos can contain many state changes,
+	   * so a check must be done before appending. */
+	  if (inquote && *cmd == quote && cmd[1] == quote)
+	    *out++ = *cmd++;
+	  else
+	    {
+	      if (!inquote)
+		quote = *cmd;
+	      if (nquotes > quotesize - 1)
+		quotepos = realloc_type(quotepos, quotesize *= 2, size_t);
+	      quotepos[nquotes++] = out - arg + inquote;
+	      inquote = !inquote;
+	    }
+	}
+      else
+	{
+	  *out++ = *cmd;
+	}
+
+      nbs = 0;
+    }
+
+  if (*cmd)
+    cmd++;
+
+  *out = '\0';
+  quotepos[nquotes++] = SIZE_MAX;
+  return arg != cmd;
+}
+
+
+/* Either X:[...] or \\server\[...] */
+#define is_dos_path(s) (isdrive(s) \
+			|| ((s)[0] == '\\' \
+			    && (s)[1] == '\\' \
+			    && isalpha ((s)[2]) \
+			    && strchr ((s) + 3, '\\')))
+
+
+/* Perform a glob on word if it contains wildcard characters.
+   Also quote every character between quotes to force glob to
+   treat the characters literally.
+
+   Call glob(3) on the word, and fill argv accordingly.
+   If the input looks like a DOS path, double up backslashes.
+ */
+static int __reg3
+globify (const char *word, size_t *quotepos, size_t quotesize, char **&argv, int &argc, int &argvlen)
+{
+  if (*word != '~' && strpbrk (word, "?*[\"\'(){}") == NULL)
+    return 0;
+
+  /* We'll need more space if there are quoting characters in
+     word.  If that is the case, doubling the size of the
+     string should provide more than enough space. */
+  size_t n = quotepos[0] == SIZE_MAX ? 0 : strlen (word);
+  char *p;
+  const char *s;
+  int dos_spec = is_dos_path (word);
+  char pattern[strlen (word) + ((dos_spec + 1) * n) + 1];
+  bool inquote = false;
+  size_t nquotes = 0;
+
+  /* Fill pattern with characters from word, quoting any
+     characters found within quotes. */
+  for (p = pattern, s = word; *s != '\000'; s++, p++)
+    {
+      if (nquotes < quotesize)
+	{
+	  if (quotepos[nquotes] == SIZE_MAX)
+	    quotesize = nquotes;
+	  else if (quotepos[nquotes] == size_t (s - word))
+	    {
+	      inquote = !inquote;
+	      nquotes++;
+	    }
+	}
+      if (!inquote)
+	{
+	  if (dos_spec && *s == '\\')
+	    *p++ = '\\';
+	  *p = *s;
+	}
+      else
+	{
+	  *p++ = '\\';
+	  int cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
+	  if (cnt <= 1)
+	    *p = *s;
+	  else
+	    {
+	      memcpy (p, s, cnt);
+	      p += cnt - 1;
+	      s += cnt - 1;
+	    }
+	}
+    }
+  *p = '\0';
+
+  glob_t gl;
+  gl.gl_offs = 0;
+
+  /* Attempt to match the argument.  Bail if no match. */
+  if (glob (pattern, GLOB_TILDE | GLOB_BRACE, NULL, &gl) || !gl.gl_pathc)
+    return 0;
+
+  /* Allocate enough space in argv for the matched filenames. */
+  n = argc;
+  if ((argc += gl.gl_pathc) > argvlen)
+    {
+      argvlen = argc + 10;
+      char **old_argv = argv;
+      argv = (char **) realloc (argv, (1 + argvlen) * sizeof (argv[0]));
+      if (!argv)
+	{
+	  free (gl.gl_pathv);
+	  argv = old_argv;
+	  return -ENOMEM;
+	}
+    }
+
+  /* Copy the matched filenames to argv. */
+  char **gv = gl.gl_pathv;
+  char **av = argv + n;
+  while (*gv)
+    {
+      debug_printf ("argv[%zu] = '%s'", n++, *gv);
+      *av++ = *gv++;
+    }
+
+  /* Clean up after glob. */
+  free (gl.gl_pathv);
+  return 1;
+}
+
+/* Build argv, argc from string passed from Windows. */
+extern "C" int
+cygwin_cmdline_parse (char *cmd, char ***pargv, char **allocs, int doglob, int maxfile)
+{
+  int argvlen = 0;
+  int nesting = 0;	      // nesting depth for @file (file_stack index)
+  int inserts = 0;	      // total @file encountered (allocs index)
+
+  // Would be a bad idea to use alloca due to unbounded file size.
+  size_t quotesize = 32;
+  size_t *quotepos = malloc_type (quotesize, size_t);
+
+  int argc = 0;
+  int error = 0;
+  char **argv = NULL;
+  char *word;
+
+  bool bail_on_file = maxfile > 0;
+  maxfile = bail_on_file ? maxfile : -maxfile;
+  char *file_stack[maxfile];
+  bool has_next;
+
+  /* Scan command line until there is nothing left. */
+  while ((has_next = next_arg (cmd, word, quotepos, quotesize)) || nesting)
+    {
+      /* Catch when "nothing" is reached but we can pop the stack. */
+      if (! has_next)
+	{
+	  cmd = file_stack[--nesting];
+	  continue;
+	}
+
+      /* Possibly look for @file construction assuming that this isn't
+	 the very first argument and the @ wasn't quoted */
+      if (argc && quotepos[0] > 0 && *word == '@')
+	{
+	  bool do_include = inserts < maxfile;
+	  char *fbuf;
+	  if (!do_include && bail_on_file)
+	    {
+	      error = -EMLINK;
+	      goto exit;
+	    }
+	  if (do_include && (fbuf = read_file (word + 1)))
+	    {
+	      file_stack[nesting++] = cmd;
+	      if (allocs != NULL)
+		allocs[inserts] = fbuf;
+
+	      inserts += 1;
+	      cmd = fbuf;
+	      continue;
+	    }
+	}
+
+      /* See if we need to allocate more space for argv */
+      if (argc >= argvlen)
+	{
+	  argvlen = argc + 10;
+	  char **old_argv = argv;
+	  argv = (char **) realloc (old_argv, (1 + argvlen) * sizeof (argv[0]));
+	  if (!argv)
+	    {
+	      argv = old_argv;
+	      error = -ENOMEM;
+	      goto exit;
+	    }
+	}
+
+      /* Add word to argv file after (optional) wildcard expansion. */
+      if (!doglob || !argc || (error = globify (word, quotepos, quotesize, argv, argc, argvlen)) > 0)
+	{
+	  debug_printf ("argv[%d] = '%s'", argc, word);
+	  argv[argc++] = word;
+	}
+      else if (error)
+	{
+	  goto exit;
+	}
+    }
+
+exit:
+  if (argv)
+    argv[argc] = NULL;
+  if (allocs != NULL)
+    allocs[inserts] = NULL;
+
+  free (quotepos);
+  debug_printf ("argc %d", argc);
+
+  *pargv = argv;
+  return error ? error : argc;
+}
diff --git a/winsup/cygwin/winf.h b/winsup/cygwin/winf.h
index e3a65f8cc..24aef9bf8 100644
--- a/winsup/cygwin/winf.h
+++ b/winsup/cygwin/winf.h
@@ -94,3 +94,5 @@ class linebuf
     return wbuf;
   }
 };
+
+extern "C" int cygwin_cmdline_parse (char *, char ***, char **, int, int);
diff --git a/winsup/cygwin/winsup.h b/winsup/cygwin/winsup.h
index fff7d18f3..3d779c742 100644
--- a/winsup/cygwin/winsup.h
+++ b/winsup/cygwin/winsup.h
@@ -133,9 +133,6 @@ extern int cygserver_running;
 
 #define set_api_fatal_return(n) do {extern int __api_fatal_exit_val; __api_fatal_exit_val = (n);} while (0)
 
-#undef issep
-#define issep(ch) (strchr (" \t\n\r", (ch)) != NULL)
-
 /* Every path beginning with / or \, as well as every path being X:
    or starting with X:/ or X:\ */
 #define isabspath_u(p) \
@@ -151,6 +148,10 @@ extern int cygserver_running;
 #define isabspath(p) \
   (isdirsep (*(p)) || (isalpha (*(p)) && (p)[1] == ':' && (!(p)[2] || isdirsep ((p)[2]))))
 
+/* Shortcut.  See also std::add_pointer. */
+#define malloc_type(n, type) ((type *) malloc ((n) * sizeof (type)))
+#define realloc_type(b, n, type) ((type *) realloc ((b), (n) * sizeof (type)))
+
 /******************** Initialization/Termination **********************/
 
 class per_process;
diff --git a/winsup/doc/cygwinenv.xml b/winsup/doc/cygwinenv.xml
index f549fee0d..54ee93443 100644
--- a/winsup/doc/cygwinenv.xml
+++ b/winsup/doc/cygwinenv.xml
@@ -34,10 +34,10 @@ There is no default set.
 </listitem>
 
 <listitem>
-<para><envar>(no)glob[:ignorecase]</envar> - if set, command line arguments
-containing UNIX-style file wildcard characters (brackets, braces, question mark,
-asterisk, escaped with \) are expanded into lists of files that match 
-those wildcards.
+<para><envar>(no)glob[:ignorecase]</envar> - if set, unquoted
+command line arguments containing UNIX-style file wildcard characters (brackets,
+braces, question mark, asterisk, escaped with \) are expanded into lists of
+files that match those wildcards.  Leading tildes are expanded.
 This is applicable only to programs run from non-Cygwin programs such as a CMD prompt.
 That means that this setting does not affect globbing operations for shells such as
 bash, sh, tcsh, zsh, etc.
diff --git a/winsup/doc/faq-api.xml b/winsup/doc/faq-api.xml
index 313f15d37..f09dd0b11 100644
--- a/winsup/doc/faq-api.xml
+++ b/winsup/doc/faq-api.xml
@@ -169,7 +169,7 @@ for the old executable and any dll into per-user subdirectories in the
 <para>If the DLL thinks it was invoked from a DOS style prompt, it runs a
 `globber' over the arguments provided on the command line.  This means
 that if you type <literal>LS *.EXE</literal> from DOS, it will do what you might
-expect.
+expect.  This only happens to the unquoted parts.
 </para>
 <para>Beware: globbing uses <literal>malloc</literal>.  If your application defines
 <literal>malloc</literal>, that will get used.  This may do horrible things to you.
-- 
2.20.1.windows.1

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2020-11-09 10:04 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-06-24 22:35 [PATCH] Cygwin: rewrite cmdline parser Mingye Wang
2020-06-25 14:43 ` [PATCH v2] " Mingye Wang
2020-06-25 14:49   ` Mingye Wang
2020-06-30 10:30   ` Corinna Vinschen
2020-09-05  5:27 [PATCH v4 3/3] testsuite: don't strip dir from obj files Mingye Wang
2020-11-07 12:12 ` [PATCH v5] Cygwin: rewrite cmdline parser Mingye Wang
2020-11-07 12:12   ` [PATCH] " Mingye Wang
2020-11-09 10:04     ` Corinna Vinschen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).