public inbox for gdb-patches@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] gdb: add UTF16/UTF32 target charsets in phony_iconv
@ 2022-10-02 14:00 Patrick Monnerat
  2022-10-07 20:10 ` Tom Tromey
  0 siblings, 1 reply; 9+ messages in thread
From: Patrick Monnerat @ 2022-10-02 14:00 UTC (permalink / raw)
  To: gdb-patches

Function phony_iconv is substituted for the system-supplied iconv on
platforms where the latter is deficient. It implements too few
conversions for the current gdb requirements. In particular, Ada support
in gdb needs to convert strings to UTF-32, which is not currently
supported: as this conversion is used to determine the language, a
warning is issued in all cases.

Conditional statements decide when the substitution occurs. This
currently enables it for mingw (wchar_t is not UTF-32) even when the
system-supplied iconv is suitable for gdb use.

This patch extends phony_iconv_open and phony_iconv functions to support
any conversion from/to host encoding, wchar_t, UTF-16 and UTF-32 with
endianness alternatives.

The value returned by phony_iconv_open is an integer token representing
the size and endianness of both character encodings involved.
---
 gdb/charset.c | 132 +++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 56 deletions(-)

diff --git a/gdb/charset.c b/gdb/charset.c
index a6261fc505c..c0528bfb9b5 100644
--- a/gdb/charset.c
+++ b/gdb/charset.c
@@ -98,27 +98,63 @@
 #undef ICONV_CONST
 #define ICONV_CONST const
 
-/* We allow conversions from UTF-32, wchar_t, and the host charset.
-   We allow conversions to wchar_t and the host charset.
-   Return 1 if we are converting from UTF-32BE, 2 if from UTF32-LE,
-   0 otherwise.  This is used as a flag in calls to iconv.  */
+/* We allow conversions from/to UTF-16, UTF-32, wchar_t, and the host charset.
+   Return a token representing the conversion or -1 if error.  The token
+   is unpacked in iconv.  */
 
-static iconv_t
-phony_iconv_open (const char *to, const char *from)
+#define TOKEN_BITS	3
+#define TOKEN_MASK	((1 << TOKEN_BITS) - 1)
+#define TOKEN_BIGENDIAN	(1 << (TOKEN_BITS - 1))
+
+static int
+phony_iconv_token (const char *encoding)
 {
-  if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
-    return -1;
+  static struct
+    {
+      const char *name;
+      int token;
+    } const encodings[] =
+    {
+      { "UTF-16",   (2 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-16BE", (2 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-16LE", (2 - 1) },
+      { "UTF-32",   (4 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-32BE", (4 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-32LE", (4 - 1) },
+      { GDB_DEFAULT_HOST_CHARSET, (1 - 1) },
+#if WORDS_BIGENDIAN
+      {	"wchar_t", (sizeof (gdb_wchar_t) - 1) | TOKEN_BIGENDIAN },
+#else
+      {	"wchar_t",  (sizeof (gdb_wchar_t) - 1) },
+#endif
+      { NULL, -1 }
+    };
 
-  if (!strcmp (from, "UTF-32BE") || !strcmp (from, "UTF-32"))
-    return 1;
+  for (auto p = encodings; p->name; p++)
+    if (strcmp (encoding, p->name) == 0)
+      return p->token;
 
-  if (!strcmp (from, "UTF-32LE"))
-    return 2;
+  return -1;
+}
 
-  if (strcmp (from, "wchar_t") && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
+static void
+phony_split_token (int token, size_t &size, enum bfd_endian &endian)
+{
+  /* Extract parameter values from the token. */
+  endian = token & TOKEN_BIGENDIAN ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
+  size = (token & TOKEN_MASK & ~TOKEN_BIGENDIAN) + 1;
+}
+
+static iconv_t
+phony_iconv_open (const char *to, const char *from)
+{
+  int totok = phony_iconv_token (to);
+  int fromtok = phony_iconv_token (from);
+
+  if (totok < 0 || fromtok < 0)
     return -1;
 
-  return 0;
+  return (totok << TOKEN_BITS) | fromtok;
 }
 
 static int
@@ -128,60 +164,44 @@ phony_iconv_close (iconv_t arg)
 }
 
 static size_t
-phony_iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
+phony_iconv (iconv_t token, const char **inbuf, size_t *inbytesleft,
 	     char **outbuf, size_t *outbytesleft)
 {
-  if (utf_flag)
+  enum bfd_endian toendian, fromendian;
+  size_t tosize, fromsize;
+  unsigned long maxval;
+
+  if(token & ~((TOKEN_MASK << TOKEN_BITS) | TOKEN_MASK))
     {
-      enum bfd_endian endian
-	= utf_flag == 1 ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
-      while (*inbytesleft >= 4)
-	{
-	  unsigned long c
-	    = extract_unsigned_integer ((const gdb_byte *)*inbuf, 4, endian);
+      errno = EBADF;
+      return (size_t) -1;
+    }
 
-	  if (c >= 256)
-	    {
-	      errno = EILSEQ;
-	      return -1;
-	    }
-	  if (*outbytesleft < 1)
-	    {
-	      errno = E2BIG;
-	      return -1;
-	    }
-	  **outbuf = c & 0xff;
-	  ++*outbuf;
-	  --*outbytesleft;
+  phony_split_token (token, fromsize, fromendian);
+  phony_split_token (token >> TOKEN_BITS, tosize, toendian);
+  maxval = 1UL << (7 * tosize);	/* Split shift to avoid count overflow. */
+  maxval = (maxval << tosize) - 1;
 
-	  *inbuf += 4;
-	  *inbytesleft -= 4;
-	}
-      if (*inbytesleft)
+  while (*inbytesleft >= fromsize)
+    {
+      unsigned long c = extract_unsigned_integer ((const gdb_byte *) *inbuf,
+                                                  fromsize, fromendian);
+
+      if (c > maxval)
 	{
-	  /* Partial sequence on input.  */
-	  errno = EINVAL;
+	  errno = EILSEQ;
 	  return -1;
 	}
-    }
-  else
-    {
-      /* In all other cases we simply copy input bytes to the
-	 output.  */
-      size_t amt = *inbytesleft;
-
-      if (amt > *outbytesleft)
-	amt = *outbytesleft;
-      memcpy (*outbuf, *inbuf, amt);
-      *inbuf += amt;
-      *outbuf += amt;
-      *inbytesleft -= amt;
-      *outbytesleft -= amt;
-      if (*inbytesleft)
+      if (*outbytesleft < tosize)
 	{
 	  errno = E2BIG;
 	  return -1;
 	}
+      store_unsigned_integer ((gdb_byte *) *outbuf, tosize, toendian, c);
+      *inbuf += fromsize;
+      *inbytesleft -= fromsize;
+      *outbuf += tosize;
+      *outbytesleft -= tosize;
     }
 
   /* The number of non-reversible conversions -- but they were all
-- 
2.37.3


^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2022-10-17 23:11 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-02 14:00 [PATCH] gdb: add UTF16/UTF32 target charsets in phony_iconv Patrick Monnerat
2022-10-07 20:10 ` Tom Tromey
2022-10-08  0:12   ` Patrick Monnerat
2022-10-08 18:55     ` Tom Tromey
2022-10-09  0:47       ` Patrick Monnerat
2022-10-10 16:11         ` Tom Tromey
2022-10-16  1:50           ` Tom Tromey
2022-10-16  6:24             ` Eli Zaretskii
2022-10-17 23:10               ` Tom Tromey

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).