public inbox for binutils@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] gas: rework handling of backslashes in quoted symbol names
@ 2021-12-17 12:44 Jan Beulich
  0 siblings, 0 replies; only message in thread
From: Jan Beulich @ 2021-12-17 12:44 UTC (permalink / raw)
  To: Binutils

Strange effects can result from the present handling, e.g.:

.if 1
"backslash\\":
.endif

yields first (correctly) "missing closing `"'" but then also "invalid
character '\' in mnemonic" and further "end of file inside conditional".
Symbols names ending in \ are in principle not expressable with that
scheme.

Instead of recording whether a backslash was seen, inspect the
subsequent character right away. Only accept \\ (meaning a single
backslash in the resulting symbol name) and \" (meaning an embedded
double quote in the resulting symbol name) for now, warning about any
other combination.

While perhaps not necessary immediately, also permit concatenated
strings to form a symbol name. This may become useful if going forward
we would want to support \<octal> or \x<hex> sequences, where closing
and re-opening quotes can be useful to delimit such sequences.

The ELF "Multibyte symbol names" test gets switched away from using
.set, as that would now also mean excluding nios2 and pru. By using
.equiv instead, even the existing #notarget can be dropped. (For h8300
the .section directive additionally needs attributes specified, to avoid
a target specific warning.)
---
Nick gave his approval already a while back, but I thought I'd better
re-post with the testsuite fallout addressed before committing.
---
I'm actually wondering whether the storing of 0 is really necessary when
we did _not_ find a symbol name.

But perhaps a more fundamental question is whether altering the input
buffer is okay; the earlier ia64 patch was primarily added because of
this (albeit I think it has its own merit). After all besides
get_symbol_name() there also is read_symbol_name(), a comment of which
says that it allocates a buffer because of escape character handling. I
have no idea why there are two functions for apparently the same purpose
in the first place, and I question whether quoted symbol names and
operands to e.g. .ascii should really be dealt with by the same
underlying string handling. In any event this is the reason why one of
the .globl directives in the testcase additions needed commenting out,
limiting the usefulness of that part of the test.

--- a/gas/expr.c
+++ b/gas/expr.c
@@ -2395,18 +2395,52 @@ get_symbol_name (char ** ilp_return)
     }
   else if (c == '"')
     {
-      bool backslash_seen;
+      char *dst = input_line_pointer;
 
       * ilp_return = input_line_pointer;
-      do
+      for (;;)
 	{
-	  backslash_seen = c == '\\';
-	  c = * input_line_pointer ++;
-	}
-      while (c != 0 && (c != '"' || backslash_seen));
+	  c = *input_line_pointer++;
+
+	  if (c == 0)
+	    {
+	      as_warn (_("missing closing '\"'"));
+	      break;
+	    }
+
+	  if (c == '"')
+	    {
+	      char *ilp_save = input_line_pointer;
+
+	      SKIP_WHITESPACE ();
+	      if (*input_line_pointer == '"')
+		{
+		  ++input_line_pointer;
+		  continue;
+		}
+	      input_line_pointer = ilp_save;
+	      break;
+	    }
 
-      if (c == 0)
-	as_warn (_("missing closing '\"'"));
+	  if (c == '\\')
+	    switch (*input_line_pointer)
+	      {
+	      case '"':
+	      case '\\':
+		c = *input_line_pointer++;
+		break;
+
+	      default:
+		if (c != 0)
+		  as_warn (_("'\\%c' in quoted symbol name; "
+			     "behavior may change in the future"),
+			   *input_line_pointer);
+		break;
+	      }
+
+	  *dst++ = c;
+	}
+      *dst = 0;
     }
   *--input_line_pointer = 0;
   return c;
--- a/gas/testsuite/gas/all/quoted-sym-names.d
+++ b/gas/testsuite/gas/all/quoted-sym-names.d
@@ -1,6 +1,13 @@
-#nm: --extern-only
+#nm: --extern-only --numeric-sort
 #name: quoted symbol names
+# No quoted strings handling (TC_STRING_ESCAPES set to 0):
+#notarget: powerpc*-*-aix* powerpc*-*-beos* powerpc-*-macos* rs6000-*-*
+# Explicitly no escapes in quoted strings:
+#notarget: z80-*-*
 
 #...
 0+00 T test-a
-
+0+01 T back\\slash
+0+02 T back"slash
+0+03 T backslash\\
+0+04 T backslash"
--- a/gas/testsuite/gas/all/quoted-sym-names.s
+++ b/gas/testsuite/gas/all/quoted-sym-names.s
@@ -1,4 +1,19 @@
 	.text
 	.globl	"test-a"
 "test-a":
-	.word 0
+	.byte 0
+	.globl	"back\\slash"
+"back\\slash":
+	.byte 0
+	.globl	"back\"slash"
+"back\"slash":
+	.byte 0
+	.globl	"backslash\\"
+"backslash\\":
+	.byte 0
+	.globl	"backslash\""
+"backslash\"":
+	.byte 0
+/*	.globl	"back""slash" */
+"back""slash":
+	.byte 0
--- a/gas/testsuite/gas/elf/syms.d
+++ b/gas/testsuite/gas/elf/syms.d
@@ -1,7 +1,5 @@
 #readelf: -S -s -p .strtab
 #name: Multibyte symbol names
-# The following targets use an unusual .set syntax...
-#notarget: alpha*-*-* h8300-*-*
 
 #...
 Section Headers:
--- a/gas/testsuite/gas/elf/syms.s
+++ b/gas/testsuite/gas/elf/syms.s
@@ -1,5 +1,5 @@
-	.section "sec\xa5\xc2tion"
+	.section "sec\xa5\xc2tion", "a"
 	
-	.set "sy\xa5\xc2mbol", .
+	.equiv "sy\xa5\xc2mbol", .
 
 	.string8 "str\xa5\xc2ing"


^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2021-12-17 12:45 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-17 12:44 [PATCH] gas: rework handling of backslashes in quoted symbol names Jan Beulich

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).