public inbox for cygwin-patches@cygwin.com
 help / color / mirror / Atom feed
From: Mingye Wang <arthur2e5@aosc.io>
To: cygwin-patches@cygwin.com
Cc: Mingye Wang <arthur2e5@aosc.io>
Subject: [PATCH] Cygwin: rewrite cmdline parser
Date: Thu, 25 Jun 2020 06:35:53 +0800	[thread overview]
Message-ID: <20200624223553.8892-1-arthur2e5@aosc.io> (raw)

This commit rewrites the cmdline parser to achieve the following:
* MSVCRT compatibility. Except for the single-quote handling (an
  extension for compatibility with old Cygwin), the parser now
  interprets option boundaries exactly like MSVCR since 2008. This fixes
  the issue where our escaping does not work with our own parsing.
* Clarity. Since globify() is no longer responsible for handling the
  opening and closing of quotes, the code is much simpler.
* Sanity. The GLOB_NOCHECK flag is removed, so a failed glob correctly
  returns the literal value. Without the change, anything path-like
  would be garbled by globify's escaping.

The change fixes two complaints of mine:
* That cygwin is incompatible with its own escape.[1]
* That there is no way to echo `C:\"` from win32.[2]
  [1]: https://cygwin.com/pipermail/cygwin/2020-June/245162.html
  [2]: https://cygwin.com/pipermail/cygwin/2019-October/242790.html

(It's never the point to spawn cygwin32 from cygwin64. Consistency
matters: with yourself always, and with the outside world when you are
supposed to.)
---
 winsup/cygwin/dcrt0.cc | 192 +++++++++++++++++++----------------------
 1 file changed, 88 insertions(+), 104 deletions(-)

diff --git a/winsup/cygwin/dcrt0.cc b/winsup/cygwin/dcrt0.cc
index 5d8b4b74e..71fdda294 100644
--- a/winsup/cygwin/dcrt0.cc
+++ b/winsup/cygwin/dcrt0.cc
@@ -151,43 +151,52 @@ isquote (char c)
   return ch == '"' || ch == '\'';
 }
 
-/* Step over a run of characters delimited by quotes */
+
+/* MSVCRT-like argument parsing.
+ * Parse a word, consuming characters and marking where quoting state is changed. */
 static /*__inline*/ char *
-quoted (char *cmd, int winshell)
+next_arg (char *cmd, char *arg, size_t* quotepos)
 {
-  char *p;
-  char quote = *cmd;
+  int inquote = 0;
+  int nbs = 0;
+  char *start = cmd;
+  char quote = '\0';
 
-  if (!winshell)
+  while (*cmd && (inquote || !issep(*cmd)))
     {
-      char *p;
-      strcpy (cmd, cmd + 1);
-      if (*(p = strchrnul (cmd, quote)))
-	strcpy (p, p + 1);
-      return p;
-    }
+      if (*cmd == '\\')
+	{
+	  nbs += 1;
+	  continue;
+	}
 
-  const char *s = quote == '\'' ? "'" : "\\\"";
-  /* This must have been run from a Windows shell, so preserve
-     quotes for globify to play with later. */
-  while (*cmd && *++cmd)
-    if ((p = strpbrk (cmd, s)) == NULL)
-      {
-	cmd = strchr (cmd, '\0');	// no closing quote
-	break;
-      }
-    else if (*p == '\\')
-      cmd = ++p;
-    else if (quote == '"' && p[1] == '"')
-      {
-	*p = '\\';
-	cmd = ++p;			// a quoted quote
-      }
-    else
-      {
-	cmd = p + 1;		// point to after end
-	break;
-      }
+      // For anything else, sort out backslashes first.
+      memset(arg, '\\', nbs / 2);
+      arg += nbs / 2;
+
+      // Single-quote is our addition.
+      if (nbs % 2 == 0 && (inquote ? *cmd == quote : isquote(*cmd)))
+	{
+	  // The infamous "" special case: emit literal '"', no change.
+	  if (inquote && *cmd == '"' && cmd[1] == '"')
+	    *arg++ = *cmd++;
+	  else
+	    {
+	      if (!inquote)
+		quote = *cmd;
+	      inquote = !inquote;
+	      *quotepos++ = cmd - start;
+	    }
+	}
+      else
+	{
+	  *arg++ = *cmd;
+	}
+
+      nbs = 0;
+      cmd++;
+    }
+  *arg = '\0';
   return cmd;
 }
 
@@ -202,67 +211,60 @@ quoted (char *cmd, int winshell)
 			    && isalpha ((s)[2]) \
 			    && strchr ((s) + 3, '\\')))
 
+/* Call glob(3) on the word, and fill argv accordingly.
+ * If the input looks like a DOS path, double up backslashes.
+ */
 static int __stdcall
-globify (char *word, char **&argv, int &argc, int &argvlen)
+globify (char *word, size_t *quotepos, char **&argv, int &argc, int &argvlen)
 {
   if (*word != '~' && strpbrk (word, "?*[\"\'(){}") == NULL)
     return 0;
 
-  int n = 0;
-  char *p, *s;
-  int dos_spec = is_dos_path (word);
-  if (!dos_spec && isquote (*word) && word[1] && word[2])
-    dos_spec = is_dos_path (word + 1);
-
   /* We'll need more space if there are quoting characters in
      word.  If that is the case, doubling the size of the
      string should provide more than enough space. */
-  if (strpbrk (word, "'\""))
-    n = strlen (word);
+  int n = quotepos[0] == SIZE_MAX ? 0 : strlen(word);
+  char *p, *s;
+  int dos_spec = is_dos_path (word);
   char pattern[strlen (word) + ((dos_spec + 1) * n) + 1];
+  int inquote = 0;
 
   /* Fill pattern with characters from word, quoting any
      characters found within quotes. */
   for (p = pattern, s = word; *s != '\000'; s++, p++)
-    if (!isquote (*s))
-      {
-	if (dos_spec && *s == '\\')
-	  *p++ = '\\';
-	*p = *s;
-      }
-    else
-      {
-	char quote = *s;
-	while (*++s && *s != quote)
-	  {
-	    if (dos_spec || *s != '\\')
-	      /* nothing */;
-	    else if (s[1] == quote || s[1] == '\\')
-	      s++;
+    {
+      if (*quotepos == s - word)
+	{
+	  inquote = !inquote;
+	  quotepos++;
+	}
+      if (!inquote)
+	{
+	  if (dos_spec && *s == '\\')
 	    *p++ = '\\';
-	    size_t cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
-	    if (cnt <= 1 || cnt == (size_t)-1)
-	      *p++ = *s;
-	    else
-	      {
-		--s;
-		while (cnt-- > 0)
-		  *p++ = *++s;
-	      }
-	  }
-	if (*s == quote)
-	  p--;
-	if (*s == '\0')
-	    break;
-      }
-
+	  *p = *s;
+	}
+      else
+	{
+	  *p++ = '\\';
+	  int cnt = isascii (*s) ? 1 : mbtowc (NULL, s, MB_CUR_MAX);
+	  if (cnt <= 1)
+	    *p = *s;
+	  else
+	    {
+	      memcpy(p, s, cnt);
+	      p += cnt - 1;
+	      s += cnt - 1;
+	    }
+	}
+    }
   *p = '\0';
 
   glob_t gl;
   gl.gl_offs = 0;
 
   /* Attempt to match the argument.  Return just word (minus quoting) if no match. */
-  if (glob (pattern, GLOB_TILDE | GLOB_NOCHECK | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
+  if (glob (pattern, GLOB_TILDE | GLOB_BRACE | GLOB_QUOTE, NULL, &gl) || !gl.gl_pathc)
     return 0;
 
   /* Allocate enough space in argv for the matched filenames. */
@@ -288,12 +290,14 @@ globify (char *word, char **&argv, int &argc, int &argvlen)
 }
 
 /* Build argv, argc from string passed from Windows.  */
-
 static void __stdcall
-build_argv (char *cmd, char **&argv, int &argc, int winshell)
+build_argv (char *cmd, char **&argv, int &argc, int doglob)
 {
   int argvlen = 0;
   int nesting = 0;		// monitor "nesting" from insert_file
+  char *word = (char *) malloc(strlen(cmd) + 1);
+  // Locations for quote transition. We can trim this down to 32767 later.
+  size_t *quotepos = (size_t *) malloc(strlen(cmd) * sizeof(size_t));
 
   argc = 0;
   argvlen = 0;
@@ -302,39 +306,17 @@ build_argv (char *cmd, char **&argv, int &argc, int winshell)
   /* Scan command line until there is nothing left. */
   while (*cmd)
     {
-      /* Ignore spaces */
-      if (issep (*cmd))
+      while (issep(*cmd))
 	{
 	  cmd++;
 	  continue;
 	}
-
-      /* Found the beginning of an argument. */
-      char *word = cmd;
-      char *sawquote = NULL;
-      while (*cmd)
-	{
-	  if (*cmd != '"' && (!winshell || *cmd != '\''))
-	    cmd++;		// Skip over this character
-	  else
-	    /* Skip over characters until the closing quote */
-	    {
-	      sawquote = cmd;
-	      /* Handle quoting.  Only strip off quotes if the parent is
-		 a Cygwin process, or if the word starts with a '@'.
-		 In this case, the insert_file function needs an unquoted
-		 DOS filename and globbing isn't performed anyway. */
-	      cmd = quoted (cmd, winshell && argc > 0 && *word != '@');
-	    }
-	  if (issep (*cmd))	// End of argument if space
-	    break;
-	}
-      if (*cmd)
-	*cmd++ = '\0';		// Terminate `word'
+      quotepos[0] = SIZE_MAX;
+      cmd = next_arg (cmd, word, quotepos);
 
       /* Possibly look for @file construction assuming that this isn't
 	 the very first argument and the @ wasn't quoted */
-      if (argc && sawquote != word && *word == '@')
+      if (argc && quotepos[0] > 1 && *word == '@')
 	{
 	  if (++nesting > MAX_AT_FILE_LEVEL)
 	    api_fatal ("Too many levels of nesting for %s", word);
@@ -350,16 +332,18 @@ build_argv (char *cmd, char **&argv, int &argc, int winshell)
 	}
 
       /* Add word to argv file after (optional) wildcard expansion. */
-      if (!winshell || !argc || !globify (word, argv, argc, argvlen))
+      if (!doglob || !argc || !globify (word, quotepos, argv, argc, argvlen))
 	{
 	  debug_printf ("argv[%d] = '%s'", argc, word);
-	  argv[argc++] = word;
+	  argv[argc++] = strdup(word);
 	}
     }
 
   if (argv)
     argv[argc] = NULL;
 
+  free(word);
+  free(quotepos);
   debug_printf ("argc %d", argc);
 }
 
@@ -1064,7 +1048,7 @@ _dll_crt0 ()
 	  if (stackaddr)
 	    {
 	      /* Set stack pointer to new address.  Set frame pointer to
-	         stack pointer and subtract 32 bytes for shadow space. */
+		 stack pointer and subtract 32 bytes for shadow space. */
 	      __asm__ ("\n\
 		       movq %[ADDR], %%rsp \n\
 		       movq  %%rsp, %%rbp  \n\
-- 
2.20.1.windows.1

             reply	other threads:[~2020-06-24 22:36 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-06-24 22:35 Mingye Wang [this message]
2020-06-25 14:43 ` [PATCH v2] " Mingye Wang
2020-06-25 14:49   ` Mingye Wang
2020-06-30 10:30   ` Corinna Vinschen
2020-09-05  5:27 [PATCH v4 3/3] testsuite: don't strip dir from obj files Mingye Wang
2020-11-07 12:12 ` [PATCH v5] Cygwin: rewrite cmdline parser Mingye Wang
2020-11-07 12:12   ` [PATCH] " Mingye Wang
2020-11-09 10:04     ` Corinna Vinschen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20200624223553.8892-1-arthur2e5@aosc.io \
    --to=arthur2e5@aosc.io \
    --cc=cygwin-patches@cygwin.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).