[PATCH] gdb: add UTF16/UTF32 target charsets in phony_iconv

public inbox for gdb-patches@sourceware.org
 help / color / mirror / Atom feed

From: Patrick Monnerat <patrick@monnerat.net>
To: gdb-patches@sourceware.org
Subject: [PATCH] gdb: add UTF16/UTF32 target charsets in phony_iconv
Date: Sun,  2 Oct 2022 16:00:10 +0200	[thread overview]
Message-ID: <20221002140010.106238-1-patrick@monnerat.net> (raw)

Function phony_iconv is substituted for the system-supplied iconv on
platforms where the latter is deficient. It implements too few
conversions for the current gdb requirements. In particular, Ada support
in gdb needs to convert strings to UTF-32, which is not currently
supported: as this is used to determine the language, a warning is
issued in all cases.

Conditional statements decide when the substitution occurs. This
currently enables it for mingw (where wchar_t is not UTF-32) even when
the system-supplied iconv is suitable for gdb use.

This patch extends the phony_iconv_open and phony_iconv functions to
support any conversion from/to the host encoding, wchar_t, UTF-16 and
UTF-32, including their big- and little-endian variants.

The value returned by phony_iconv_open is an integer token representing
the size and endianness of both character encodings involved.
---
 gdb/charset.c | 132 +++++++++++++++++++++++++++++---------------------
 1 file changed, 76 insertions(+), 56 deletions(-)

diff --git a/gdb/charset.c b/gdb/charset.c
index a6261fc505c..c0528bfb9b5 100644
--- a/gdb/charset.c
+++ b/gdb/charset.c
@@ -98,27 +98,63 @@
 #undef ICONV_CONST
 #define ICONV_CONST const
 
-/* We allow conversions from UTF-32, wchar_t, and the host charset.
-   We allow conversions to wchar_t and the host charset.
-   Return 1 if we are converting from UTF-32BE, 2 if from UTF32-LE,
-   0 otherwise.  This is used as a flag in calls to iconv.  */
+/* We allow conversions from/to UTF-16, UTF-32, wchar_t, and the host charset.
+   Return a token representing the conversion or -1 if error.  The token
+   is unpacked in iconv.  */
 
-static iconv_t
-phony_iconv_open (const char *to, const char *from)
+#define TOKEN_BITS	3
+#define TOKEN_MASK	((1 << TOKEN_BITS) - 1)
+#define TOKEN_BIGENDIAN	(1 << (TOKEN_BITS - 1))
+
+static int
+phony_iconv_token (const char *encoding)
 {
-  if (strcmp (to, "wchar_t") && strcmp (to, GDB_DEFAULT_HOST_CHARSET))
-    return -1;
+  static struct
+    {
+      const char *name;
+      int token;
+    } const encodings[] =
+    {
+      { "UTF-16",   (2 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-16BE", (2 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-16LE", (2 - 1) },
+      { "UTF-32",   (4 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-32BE", (4 - 1) | TOKEN_BIGENDIAN },
+      { "UTF-32LE", (4 - 1) },
+      { GDB_DEFAULT_HOST_CHARSET, (1 - 1) },
+#if WORDS_BIGENDIAN
+      {	"wchar_t", (sizeof (gdb_wchar_t) - 1) | TOKEN_BIGENDIAN },
+#else
+      {	"wchar_t",  (sizeof (gdb_wchar_t) - 1) },
+#endif
+      { NULL, -1 }
+    };
 
-  if (!strcmp (from, "UTF-32BE") || !strcmp (from, "UTF-32"))
-    return 1;
+  for (auto p = encodings; p->name; p++)
+    if (strcmp (encoding, p->name) == 0)
+      return p->token;
 
-  if (!strcmp (from, "UTF-32LE"))
-    return 2;
+  return -1;
+}
 
-  if (strcmp (from, "wchar_t") && strcmp (from, GDB_DEFAULT_HOST_CHARSET))
+static void
+phony_split_token (int token, size_t &size, enum bfd_endian &endian)
+{
+  /* Extract parameter values from the token. */
+  endian = token & TOKEN_BIGENDIAN ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
+  size = (token & TOKEN_MASK & ~TOKEN_BIGENDIAN) + 1;
+}
+
+static iconv_t
+phony_iconv_open (const char *to, const char *from)
+{
+  int totok = phony_iconv_token (to);
+  int fromtok = phony_iconv_token (from);
+
+  if (totok < 0 || fromtok < 0)
     return -1;
 
-  return 0;
+  return (totok << TOKEN_BITS) | fromtok;
 }
 
 static int
@@ -128,60 +164,44 @@ phony_iconv_close (iconv_t arg)
 }
 
 static size_t
-phony_iconv (iconv_t utf_flag, const char **inbuf, size_t *inbytesleft,
+phony_iconv (iconv_t token, const char **inbuf, size_t *inbytesleft,
 	     char **outbuf, size_t *outbytesleft)
 {
-  if (utf_flag)
+  enum bfd_endian toendian, fromendian;
+  size_t tosize, fromsize;
+  unsigned long maxval;
+
+  if(token & ~((TOKEN_MASK << TOKEN_BITS) | TOKEN_MASK))
     {
-      enum bfd_endian endian
-	= utf_flag == 1 ? BFD_ENDIAN_BIG : BFD_ENDIAN_LITTLE;
-      while (*inbytesleft >= 4)
-	{
-	  unsigned long c
-	    = extract_unsigned_integer ((const gdb_byte *)*inbuf, 4, endian);
+      errno = EBADF;
+      return (size_t) -1;
+    }
 
-	  if (c >= 256)
-	    {
-	      errno = EILSEQ;
-	      return -1;
-	    }
-	  if (*outbytesleft < 1)
-	    {
-	      errno = E2BIG;
-	      return -1;
-	    }
-	  **outbuf = c & 0xff;
-	  ++*outbuf;
-	  --*outbytesleft;
+  phony_split_token (token, fromsize, fromendian);
+  phony_split_token (token >> TOKEN_BITS, tosize, toendian);
+  maxval = 1UL << (7 * tosize);	/* Split shift to avoid count overflow. */
+  maxval = (maxval << tosize) - 1;
 
-	  *inbuf += 4;
-	  *inbytesleft -= 4;
-	}
-      if (*inbytesleft)
+  while (*inbytesleft >= fromsize)
+    {
+      unsigned long c = extract_unsigned_integer ((const gdb_byte *) *inbuf,
+                                                  fromsize, fromendian);
+
+      if (c > maxval)
 	{
-	  /* Partial sequence on input.  */
-	  errno = EINVAL;
+	  errno = EILSEQ;
 	  return -1;
 	}
-    }
-  else
-    {
-      /* In all other cases we simply copy input bytes to the
-	 output.  */
-      size_t amt = *inbytesleft;
-
-      if (amt > *outbytesleft)
-	amt = *outbytesleft;
-      memcpy (*outbuf, *inbuf, amt);
-      *inbuf += amt;
-      *outbuf += amt;
-      *inbytesleft -= amt;
-      *outbytesleft -= amt;
-      if (*inbytesleft)
+      if (*outbytesleft < tosize)
 	{
 	  errno = E2BIG;
 	  return -1;
 	}
+      store_unsigned_integer ((gdb_byte *) *outbuf, tosize, toendian, c);
+      *inbuf += fromsize;
+      *inbytesleft -= fromsize;
+      *outbuf += tosize;
+      *outbytesleft -= tosize;
     }
 
   /* The number of non-reversible conversions -- but they were all
-- 
2.37.3

next             reply	other threads:[~2022-10-02 14:00 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-10-02 14:00 Patrick Monnerat [this message]
2022-10-07 20:10 ` Tom Tromey
2022-10-08  0:12   ` Patrick Monnerat
2022-10-08 18:55     ` Tom Tromey
2022-10-09  0:47       ` Patrick Monnerat
2022-10-10 16:11         ` Tom Tromey
2022-10-16  1:50           ` Tom Tromey
2022-10-16  6:24             ` Eli Zaretskii
2022-10-17 23:10               ` Tom Tromey

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20221002140010.106238-1-patrick@monnerat.net \
    --to=patrick@monnerat.net \
    --cc=gdb-patches@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).