[PATCH 0/5] Assume UTF-8 encoding for localedef input files

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH 0/5] Assume UTF-8 encoding for localedef input files
@ 2022-05-19 21:06 Florian Weimer
  2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
                   ` (5 more replies)
  0 siblings, 6 replies; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

This is a backwards-compatible change because of two localedef bugs that
cause bytes outside the ASCII range to produce unpredictable results:

  If char is signed, conversion from the assumed ISO-8859-1 input format
  to a UCS-4 codepoint does not produce the correct result.

  If the output character set is not overlapping ISO-8859-1 in the
  characters used in the locale, the required character set conversion
  is not applied.

This is why I think we can switch to UTF-8 without impacting backwards
compatibility, and there is no need for an option to restore the old
behavior.

Tested on i686-linux-gnu and x86_64-linux-gnu.

Thanks,
Florian

Florian Weimer (5):
  locale: Turn ADDC and ADDS into functions in linereader.c
  locale: Fix signed char bug in lr_getc
  locale: Introduce translate_unicode_codepoint into linereader.c
  locale: localdef input files are now encoded in UTF-8
  de_DE: Convert to UTF-8

 NEWS                         |   4 +
 locale/programs/linereader.c | 504 ++++++++++++++++++++++-------------
 locale/programs/linereader.h |   2 +-
 localedata/locales/de_DE     |  32 +--
 4 files changed, 338 insertions(+), 204 deletions(-)

base-commit: 2d5ec6692f5746ccb11db60976a6481ef8e9d74f
-- 
2.35.3

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
@ 2022-05-19 21:06 ` Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
  2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
                   ` (4 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

And introduce struct lr_buffer.  The functions addc and adds can
be called from functions, enabling subsequent refactoring.
---
 locale/programs/linereader.c | 203 ++++++++++++++++++-----------------
 1 file changed, 104 insertions(+), 99 deletions(-)

diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index 146d32e5e2..d5367e0a1e 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -416,36 +416,60 @@ get_toplvl_escape (struct linereader *lr)
   return &lr->token;
 }
 
+/* Multibyte string buffer.  */
+struct lr_buffer
+{
+  size_t act;
+  size_t max;
+  char *buf;
+};
 
-#define ADDC(ch) \
-  do									      \
-    {									      \
-      if (bufact == bufmax)						      \
-	{								      \
-	  bufmax *= 2;							      \
-	  buf = xrealloc (buf, bufmax);					      \
-	}								      \
-      buf[bufact++] = (ch);						      \
-    }									      \
-  while (0)
+/* Initialize *LRB with a default-sized buffer.  */
+static void
+lr_buffer_init (struct lr_buffer *lrb)
+{
+ lrb->act = 0;
+ lrb->max = 56;
+ lrb->buf = xmalloc (lrb->max);
+}
 
+/* Transfers the buffer string from *LRB to LR->token.mbstr.  */
+static void
+lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
+{
+  lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
+  lr->token.val.str.startmb[lrb->act] = '\0';
+  lr->token.val.str.lenmb = lrb->act;
+}
 
-#define ADDS(s, l) \
-  do									      \
-    {									      \
-      size_t _l = (l);							      \
-      if (bufact + _l > bufmax)						      \
-	{								      \
-	  if (bufact < _l)						      \
-	    bufact = _l;						      \
-	  bufmax *= 2;							      \
-	  buf = xrealloc (buf, bufmax);					      \
-	}								      \
-      memcpy (&buf[bufact], s, _l);					      \
-      bufact += _l;							      \
-    }									      \
-  while (0)
+/* Adds CH to *LRB.  */
+static void
+addc (struct lr_buffer *lrb, char ch)
+{
+  if (lrb->act == lrb->max)
+    {
+      lrb->max *= 2;
+      lrb->buf = xrealloc (lrb->buf, lrb->max);
+    }
+  lrb->buf[lrb->act++] = ch;
+}
 
+/* Adds L bytes at S to *LRB.  */
+static void
+adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
+{
+  if (lrb->max - lrb->act < l)
+    {
+      size_t required_size = lrb->act + l;
+      size_t new_max = 2 * lrb->max;
+      if (new_max < required_size)
+	new_max = required_size;
+      lrb->buf = xrealloc (lrb->buf, new_max);
+      lrb->max = new_max;
+    }
+  memcpy (lrb->buf + lrb->act, s, l);
+  lrb->act += l;
+}
 
 #define ADDWC(ch) \
   do									      \
@@ -467,13 +491,11 @@ get_symname (struct linereader *lr)
      1. reserved words
      2. ISO 10646 position values
      3. all other.  */
-  char *buf;
-  size_t bufact = 0;
-  size_t bufmax = 56;
   const struct keyword_t *kw;
   int ch;
+  struct lr_buffer lrb;
 
-  buf = (char *) xmalloc (bufmax);
+  lr_buffer_init (&lrb);
 
   do
     {
@@ -481,13 +503,13 @@ get_symname (struct linereader *lr)
       if (ch == lr->escape_char)
 	{
 	  int c2 = lr_getc (lr);
-	  ADDC (c2);
+	  addc (&lrb, c2);
 
 	  if (c2 == '\n')
 	    ch = '\n';
 	}
       else
-	ADDC (ch);
+	addc (&lrb, ch);
     }
   while (ch != '>' && ch != '\n');
 
@@ -495,39 +517,35 @@ get_symname (struct linereader *lr)
     lr_error (lr, _("unterminated symbolic name"));
 
   /* Test for ISO 10646 position value.  */
-  if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
+  if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))
     {
-      char *cp = buf + 1;
-      while (cp < &buf[bufact - 1] && isxdigit (*cp))
+      char *cp = lrb.buf + 1;
+      while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))
 	++cp;
 
-      if (cp == &buf[bufact - 1])
+      if (cp == &lrb.buf[lrb.act - 1])
 	{
 	  /* Yes, it is.  */
 	  lr->token.tok = tok_ucs4;
-	  lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
+	  lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);
 
 	  return &lr->token;
 	}
     }
 
   /* It is a symbolic name.  Test for reserved words.  */
-  kw = lr->hash_fct (buf, bufact - 1);
+  kw = lr->hash_fct (lrb.buf, lrb.act - 1);
 
   if (kw != NULL && kw->symname_or_ident == 1)
     {
       lr->token.tok = kw->token;
-      free (buf);
+      free (lrb.buf);
     }
   else
     {
       lr->token.tok = tok_bsymbol;
-
-      buf = xrealloc (buf, bufact + 1);
-      buf[bufact] = '\0';
-
-      lr->token.val.str.startmb = buf;
-      lr->token.val.str.lenmb = bufact - 1;
+      lr_buffer_to_token (&lrb, lr);
+      --lr->token.val.str.lenmb;  /* Hide the training '>'.  */
     }
 
   return &lr->token;
@@ -537,16 +555,13 @@ get_symname (struct linereader *lr)
 static struct token *
 get_ident (struct linereader *lr)
 {
-  char *buf;
-  size_t bufact;
-  size_t bufmax = 56;
   const struct keyword_t *kw;
   int ch;
+  struct lr_buffer lrb;
 
-  buf = xmalloc (bufmax);
-  bufact = 0;
+  lr_buffer_init (&lrb);
 
-  ADDC (lr->buf[lr->idx - 1]);
+  addc (&lrb, lr->buf[lr->idx - 1]);
 
   while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
 	 && ch != '<' && ch != ',' && ch != EOF)
@@ -560,27 +575,22 @@ get_ident (struct linereader *lr)
 	      break;
 	    }
 	}
-      ADDC (ch);
+      addc (&lrb, ch);
     }
 
   lr_ungetc (lr, ch);
 
-  kw = lr->hash_fct (buf, bufact);
+  kw = lr->hash_fct (lrb.buf, lrb.act);
 
   if (kw != NULL && kw->symname_or_ident == 0)
     {
       lr->token.tok = kw->token;
-      free (buf);
+      free (lrb.buf);
     }
   else
     {
       lr->token.tok = tok_ident;
-
-      buf = xrealloc (buf, bufact + 1);
-      buf[bufact] = '\0';
-
-      lr->token.val.str.startmb = buf;
-      lr->token.val.str.lenmb = bufact;
+      lr_buffer_to_token (&lrb, lr);
     }
 
   return &lr->token;
@@ -593,14 +603,10 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	    int verbose)
 {
   int return_widestr = lr->return_widestr;
-  char *buf;
+  struct lr_buffer lrb;
   wchar_t *buf2 = NULL;
-  size_t bufact;
-  size_t bufmax = 56;
 
-  /* We must return two different strings.  */
-  buf = xmalloc (bufmax);
-  bufact = 0;
+  lr_buffer_init (&lrb);
 
   /* We know it'll be a string.  */
   lr->token.tok = tok_string;
@@ -613,19 +619,19 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
       buf2 = NULL;
       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
-	ADDC (ch);
+	addc (&lrb, ch);
 
       /* Catch errors with trailing escape character.  */
-      if (bufact > 0 && buf[bufact - 1] == lr->escape_char
-	  && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
+      if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
+	  && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
 	{
 	  lr_error (lr, _("illegal escape sequence at end of string"));
-	  --bufact;
+	  --lrb.act;
 	}
       else if (ch == '\n' || ch == EOF)
 	lr_error (lr, _("unterminated string"));
 
-      ADDC ('\0');
+      addc (&lrb, '\0');
     }
   else
     {
@@ -662,7 +668,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		    break;
 		}
 
-	      ADDC (ch);
+	      addc (&lrb, ch);
 	      if (return_widestr)
 		ADDWC ((uint32_t) ch);
 
@@ -671,7 +677,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
 	  /* Now we have to search for the end of the symbolic name, i.e.,
 	     the closing '>'.  */
-	  startidx = bufact;
+	  startidx = lrb.act;
 	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
 	    {
 	      if (ch == lr->escape_char)
@@ -680,12 +686,12 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		  if (ch == '\n' || ch == EOF)
 		    break;
 		}
-	      ADDC (ch);
+	      addc (&lrb, ch);
 	    }
 	  if (ch == '\n' || ch == EOF)
 	    /* Not a correct string.  */
 	    break;
-	  if (bufact == startidx)
+	  if (lrb.act == startidx)
 	    {
 	      /* <> is no correct name.  Ignore it and also signal an
 		 error.  */
@@ -694,23 +700,23 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	    }
 
 	  /* It might be a Uxxxx symbol.  */
-	  if (buf[startidx] == 'U'
-	      && (bufact - startidx == 5 || bufact - startidx == 9))
+	  if (lrb.buf[startidx] == 'U'
+	      && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
 	    {
-	      char *cp = buf + startidx + 1;
-	      while (cp < &buf[bufact] && isxdigit (*cp))
+	      char *cp = lrb.buf + startidx + 1;
+	      while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
 		++cp;
 
-	      if (cp == &buf[bufact])
+	      if (cp == &lrb.buf[lrb.act])
 		{
 		  char utmp[10];
 
 		  /* Yes, it is.  */
-		  ADDC ('\0');
-		  wch = strtoul (buf + startidx + 1, NULL, 16);
+		  addc (&lrb, '\0');
+		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
 
 		  /* Now forget about the name we just added.  */
-		  bufact = startidx;
+		  lrb.act = startidx;
 
 		  if (return_widestr)
 		    ADDWC (wch);
@@ -774,7 +780,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 				      seq = charmap_find_value (charmap, utmp,
 								9);
 				      assert (seq != NULL);
-				      ADDS (seq->bytes, seq->nbytes);
+				      adds (&lrb, seq->bytes, seq->nbytes);
 				    }
 
 				  continue;
@@ -788,24 +794,24 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		    }
 
 		  if (seq != NULL)
-		    ADDS (seq->bytes, seq->nbytes);
+		    adds (&lrb, seq->bytes, seq->nbytes);
 
 		  continue;
 		}
 	    }
 
-	  /* We now have the symbolic name in buf[startidx] to
-	     buf[bufact-1].  Now find out the value for this character
+	  /* We now have the symbolic name in lrb.buf[startidx] to
+	     lrb.buf[lrb.act-1].  Now find out the value for this character
 	     in the charmap as well as in the repertoire map (in this
 	     order).  */
-	  seq = charmap_find_value (charmap, &buf[startidx],
-				    bufact - startidx);
+	  seq = charmap_find_value (charmap, &lrb.buf[startidx],
+				    lrb.act - startidx);
 
 	  if (seq == NULL)
 	    {
 	      /* This name is not in the charmap.  */
 	      lr_error (lr, _("symbol `%.*s' not in charmap"),
-			(int) (bufact - startidx), &buf[startidx]);
+			(int) (lrb.act - startidx), &lrb.buf[startidx]);
 	      illegal_string = 1;
 	    }
 
@@ -816,8 +822,8 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		wch = seq->ucs4;
 	      else
 		{
-		  wch = repertoire_find_value (repertoire, &buf[startidx],
-					       bufact - startidx);
+		  wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
+					       lrb.act - startidx);
 		  if (seq != NULL)
 		    seq->ucs4 = wch;
 		}
@@ -826,7 +832,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		{
 		  /* This name is not in the repertoire map.  */
 		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
-			    (int) (bufact - startidx), &buf[startidx]);
+			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
 		  illegal_string = 1;
 		}
 	      else
@@ -834,11 +840,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	    }
 
 	  /* Now forget about the name we just added.  */
-	  bufact = startidx;
+	  lrb.act = startidx;
 
 	  /* And copy the bytes.  */
 	  if (seq != NULL)
-	    ADDS (seq->bytes, seq->nbytes);
+	    adds (&lrb, seq->bytes, seq->nbytes);
 	}
 
       if (ch == '\n' || ch == EOF)
@@ -849,7 +855,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
       if (illegal_string)
 	{
-	  free (buf);
+	  free (lrb.buf);
 	  free (buf2);
 	  lr->token.val.str.startmb = NULL;
 	  lr->token.val.str.lenmb = 0;
@@ -859,7 +865,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	  return &lr->token;
 	}
 
-      ADDC ('\0');
+      addc (&lrb, '\0');
 
       if (return_widestr)
 	{
@@ -870,8 +876,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	}
     }
 
-  lr->token.val.str.startmb = xrealloc (buf, bufact);
-  lr->token.val.str.lenmb = bufact;
+  lr_buffer_to_token (&lrb, lr);
 
   return &lr->token;
 }
-- 
2.35.3



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 2/5] locale: Fix signed char bug in lr_getc
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
  2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
@ 2022-05-19 21:06 ` Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
  2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

The array lr->buf contains characters, which can be signed.  A 0xff
byte in the input could be incorrectly reported as EOF.  More
importantly, get_string in linereader.c converts a signed input byte
to a Unicode code point using ADDWC ((uint32_t) ch), under the
assumption that this decodes the ISO-8859-1 input encoding.  If char
is signed, this does not give the correct result.  This means that
ISO-8859-1 input files for localedef are not actually supported,
contrary to the comment in get_string.  This is a happy accident because
we can therefore change the file encoding to UTF-8 without impacting
backwards compatibility.

While at it, remove the \32 check for MS-DOS end-of-file character (^Z).
---
 locale/programs/linereader.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/locale/programs/linereader.h b/locale/programs/linereader.h
index 0fb10ec833..653a71d2d1 100644
--- a/locale/programs/linereader.h
+++ b/locale/programs/linereader.h
@@ -134,7 +134,7 @@ lr_getc (struct linereader *lr)
 	return EOF;
     }

-  return lr->buf[lr->idx] == '\32' ? EOF : lr->buf[lr->idx++];
+  return lr->buf[lr->idx++] & 0xff;
 }

-- 
2.35.3

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
  2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
  2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
@ 2022-05-19 21:06 ` Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
  2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

This will permit reusing the Unicode character processing for
different character encodings, not just the current <U...> encoding.
---
 locale/programs/linereader.c | 167 ++++++++++++++++++-----------------
 1 file changed, 85 insertions(+), 82 deletions(-)

diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index d5367e0a1e..f7292f0102 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
   return &lr->token;
 }
 
+/* Process a decoded Unicode codepoint WCH in a string, placing the
+   multibyte sequence into LRB.  Return false if the character is not
+   found in CHARMAP/REPERTOIRE.  */
+static bool
+translate_unicode_codepoint (struct localedef_t *locale,
+			     const struct charmap_t *charmap,
+			     const struct repertoire_t *repertoire,
+			     uint32_t wch, struct lr_buffer *lrb)
+{
+  /* See whether the charmap contains the Uxxxxxxxx names.  */
+  char utmp[10];
+  snprintf (utmp, sizeof (utmp), "U%08X", wch);
+  struct charseq *seq = charmap_find_value (charmap, utmp, 9);
+
+  if (seq == NULL)
+    {
+      /* No, this isn't the case.  Now determine from
+	 the repertoire the name of the character and
+	 find it in the charmap.  */
+      if (repertoire != NULL)
+	{
+	  const char *symbol = repertoire_find_symbol (repertoire, wch);
+	  if (symbol != NULL)
+	    seq = charmap_find_value (charmap, symbol, strlen (symbol));
+	}
+
+      if (seq == NULL)
+	{
+#ifndef NO_TRANSLITERATION
+	  /* Transliterate if possible.  */
+	  if (locale != NULL)
+	    {
+	      if ((locale->avail & CTYPE_LOCALE) == 0)
+		{
+		  /* Load the CTYPE data now.  */
+		  int old_needed = locale->needed;
+
+		  locale->needed = 0;
+		  locale = load_locale (LC_CTYPE, locale->name,
+					locale->repertoire_name,
+					charmap, locale);
+		  locale->needed = old_needed;
+		}
+
+	      uint32_t *translit;
+	      if ((locale->avail & CTYPE_LOCALE) != 0
+		  && ((translit = find_translit (locale, charmap, wch))
+		      != NULL))
+		/* The CTYPE data contains a matching
+		   transliteration.  */
+		{
+		  for (int i = 0; translit[i] != 0; ++i)
+		    {
+		      snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
+		      seq = charmap_find_value (charmap, utmp, 9);
+		      assert (seq != NULL);
+		      adds (lrb, seq->bytes, seq->nbytes);
+		    }
+		  return true;
+		}
+	    }
+#endif	/* NO_TRANSLITERATION */
+
+	  /* Not a known name.  */
+	  return false;
+	}
+    }
+
+  if (seq != NULL)
+    {
+      adds (lrb, seq->bytes, seq->nbytes);
+      return true;
+    }
+  else
+    return false;
+}
+
 
 static struct token *
 get_string (struct linereader *lr, const struct charmap_t *charmap,
@@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
     }
   else
     {
-      int illegal_string = 0;
+      bool illegal_string = false;
       size_t buf2act = 0;
       size_t buf2max = 56 * sizeof (uint32_t);
       int ch;
@@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	    {
 	      /* <> is no correct name.  Ignore it and also signal an
 		 error.  */
-	      illegal_string = 1;
+	      illegal_string = true;
 	      continue;
 	    }
 
@@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
 	      if (cp == &lrb.buf[lrb.act])
 		{
-		  char utmp[10];
-
 		  /* Yes, it is.  */
 		  addc (&lrb, '\0');
 		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
@@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		  if (return_widestr)
 		    ADDWC (wch);
 
-		  /* See whether the charmap contains the Uxxxxxxxx names.  */
-		  snprintf (utmp, sizeof (utmp), "U%08X", wch);
-		  seq = charmap_find_value (charmap, utmp, 9);
-
-		  if (seq == NULL)
-		    {
-		     /* No, this isn't the case.  Now determine from
-			the repertoire the name of the character and
-			find it in the charmap.  */
-		      if (repertoire != NULL)
-			{
-			  const char *symbol;
-
-			  symbol = repertoire_find_symbol (repertoire, wch);
-
-			  if (symbol != NULL)
-			    seq = charmap_find_value (charmap, symbol,
-						      strlen (symbol));
-			}
-
-		      if (seq == NULL)
-			{
-#ifndef NO_TRANSLITERATION
-			  /* Transliterate if possible.  */
-			  if (locale != NULL)
-			    {
-			      uint32_t *translit;
-
-			      if ((locale->avail & CTYPE_LOCALE) == 0)
-				{
-				  /* Load the CTYPE data now.  */
-				  int old_needed = locale->needed;
-
-				  locale->needed = 0;
-				  locale = load_locale (LC_CTYPE,
-							locale->name,
-							locale->repertoire_name,
-							charmap, locale);
-				  locale->needed = old_needed;
-				}
-
-			      if ((locale->avail & CTYPE_LOCALE) != 0
-				  && ((translit = find_translit (locale,
-								 charmap, wch))
-				      != NULL))
-				/* The CTYPE data contains a matching
-				   transliteration.  */
-				{
-				  int i;
-
-				  for (i = 0; translit[i] != 0; ++i)
-				    {
-				      char utmp[10];
-
-				      snprintf (utmp, sizeof (utmp), "U%08X",
-						translit[i]);
-				      seq = charmap_find_value (charmap, utmp,
-								9);
-				      assert (seq != NULL);
-				      adds (&lrb, seq->bytes, seq->nbytes);
-				    }
-
-				  continue;
-				}
-			    }
-#endif	/* NO_TRANSLITERATION */
-
-			  /* Not a known name.  */
-			  illegal_string = 1;
-			}
-		    }
-
-		  if (seq != NULL)
-		    adds (&lrb, seq->bytes, seq->nbytes);
-
+		  if (!translate_unicode_codepoint (locale, charmap,
+						    repertoire, wch, &lrb))
+		    illegal_string = true;
 		  continue;
 		}
 	    }
@@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 	      /* This name is not in the charmap.  */
 	      lr_error (lr, _("symbol `%.*s' not in charmap"),
 			(int) (lrb.act - startidx), &lrb.buf[startidx]);
-	      illegal_string = 1;
+	      illegal_string = true;
 	    }
 
 	  if (return_widestr)
@@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 		  /* This name is not in the repertoire map.  */
 		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
 			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
-		  illegal_string = 1;
+		  illegal_string = true;
 		}
 	      else
 		ADDWC (wch);
@@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
       if (ch == '\n' || ch == EOF)
 	{
 	  lr_error (lr, _("unterminated string"));
-	  illegal_string = 1;
+	  illegal_string = true;
 	}
 
       if (illegal_string)
-- 
2.35.3



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 4/5] locale: localdef input files are now encoded in UTF-8
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
                   ` (2 preceding siblings ...)
  2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
@ 2022-05-19 21:06 ` Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
  2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
  2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell
  5 siblings, 1 reply; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

Previously, they were assumed to be in ISO-8859-1, and that the output
charset overlapped with ISO-8859-1 for the characters actually used.
However, this did not work as intended on many architectures even for
an ISO-8859-1 output encoding because of the char signedness bug in
lr_getc.  Therefore, this commit switches to UTF-8 without making
provisions for backwards compatibility.

The following Elisp code can be used to convert locale definition files
to UTF-8:

(defun glibc/convert-localedef (from to)
  (interactive "r")
  (save-excursion
    (save-restriction
      (narrow-to-region from to)
      (goto-char (point-min))
      (save-match-data
	(while (re-search-forward "<U\\([0-9a-fA-F]+\\)>" nil t)
	  (let* ((codepoint (string-to-number (match-string 1) 16))
		 (converted
		  (cond
		   ((memq codepoint '(?/ ?\ ?< ?>))
		    (string ?/ codepoint))
		   ((= codepoint ?\") "<U0022>")
		   (t (string codepoint)))))
	    (replace-match converted t)))))))
---
 NEWS                         |   4 +
 locale/programs/linereader.c | 144 ++++++++++++++++++++++++++++++++---
 2 files changed, 137 insertions(+), 11 deletions(-)

diff --git a/NEWS b/NEWS
index ad0c08d8ca..7ce0d8a135 100644
--- a/NEWS
+++ b/NEWS
@@ -20,6 +20,10 @@ Major new features:
   have been added.  The pidfd functionality provides access to a process
   while avoiding the issue of PID reuse on tranditional Unix systems.
 
+* localedef now accepts locale definition files encoded in UTF-8.
+  Previously, input bytes not within the ASCII range resulted in
+  unpredictable output.
+
 Deprecated and removed features, and other changes affecting compatibility:
 
 * Support for prelink will be removed in the next release; this includes
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index f7292f0102..b484327969 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -42,6 +42,7 @@ static struct token *get_string (struct linereader *lr,
 				 struct localedef_t *locale,
 				 const struct repertoire_t *repertoire,
 				 int verbose);
+static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);
 
 
 struct linereader *
@@ -327,6 +328,17 @@ lr_token (struct linereader *lr, const struct charmap_t *charmap,
 	}
       lr_ungetn (lr, 2);
       break;
+
+    case 0x80 ... 0xff:		/* UTF-8 sequence.  */
+      uint32_t wch;
+      if (!utf8_decode (lr, ch, &wch))
+	{
+	  lr->token.tok = tok_error;
+	  return &lr->token;
+	}
+      lr->token.tok = tok_ucs4;
+      lr->token.val.ucs4 = wch;
+      return &lr->token;
     }
 
   return get_ident (lr);
@@ -673,6 +685,87 @@ translate_unicode_codepoint (struct localedef_t *locale,
     return false;
 }
 
+/* Returns true if ch is not EOF (that is, non-negative) and a valid
+   UTF-8 trailing byte.  */
+static bool
+utf8_valid_trailing (int ch)
+{
+  return ch >= 0 && (ch & 0xc0) == 0x80;
+}
+
+/* Reports an error for a broken UTF-8 sequence.  CH2 to CH4 may be
+   EOF.  Always returns false.  */
+static bool
+utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
+		     int ch4)
+{
+  char buf[30];
+
+  if (ch2 < 0)
+    snprintf (buf, sizeof (buf), "0x%02x", ch1);
+  else if (ch3 < 0)
+    snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
+  else if (ch4 < 0)
+    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
+  else
+    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
+	      ch1, ch2, ch3, ch4);
+
+  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
+  return false;
+}
+
+/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
+   stores the decoded codepoint in *WCH.  Returns false on failure and
+   reports an error.  */
+static bool
+utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
+{
+  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */
+  if (ch1 < 0xc2)
+    return utf8_sequence_error (lr, ch1, -1, -1, -1);
+
+  int ch2 = lr_getc (lr);
+  if (!utf8_valid_trailing (ch2))
+    return utf8_sequence_error (lr, ch1, ch2, -1, -1);
+
+  if (ch1 <= 0xdf)
+    {
+      uint32_t result = ((ch1 & 0x1f)  << 6) | (ch2 & 0x3f);
+      if (result < 0x80)
+	return utf8_sequence_error (lr, ch1, ch2, -1, -1);
+      *wch = result;
+      return true;
+    }
+
+  int ch3 = lr_getc (lr);
+  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
+    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
+
+  if (ch1 <= 0xef)
+    {
+      uint32_t result = (((ch1 & 0x0f)  << 12)
+			 | ((ch2 & 0x3f) << 6)
+			 | (ch3 & 0x3f));
+      if (result < 0x800)
+	return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
+      *wch = result;
+      return true;
+    }
+
+  int ch4 = lr_getc (lr);
+  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
+    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
+
+  uint32_t result = (((ch1 & 0x07)  << 18)
+		     | ((ch2 & 0x3f) << 12)
+		     | ((ch3 & 0x3f) << 6)
+		     | (ch4 & 0x3f));
+  if (result < 0x10000)
+    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
+  *wch = result;
+  return true;
+}
 
 static struct token *
 get_string (struct linereader *lr, const struct charmap_t *charmap,
@@ -696,7 +789,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
       buf2 = NULL;
       while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
-	addc (&lrb, ch);
+	{
+	  if (ch >= 0x80)
+	    lr_error (lr, _("illegal 8-bit character in untranslated string"));
+	  addc (&lrb, ch);
+	}
 
       /* Catch errors with trailing escape character.  */
       if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
@@ -730,24 +827,49 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
 
 	  if (ch != '<')
 	    {
-	      /* The standards leave it up to the implementation to decide
-		 what to do with character which stand for themself.  We
-		 could jump through hoops to find out the value relative to
-		 the charmap and the repertoire map, but instead we leave
-		 it up to the locale definition author to write a better
-		 definition.  We assume here that every character which
-		 stands for itself is encoded using ISO 8859-1.  Using the
-		 escape character is allowed.  */
+	      /* The standards leave it up to the implementation to
+		 decide what to do with characters which stand for
+		 themselves.  This implementation treats the input
+		 file as encoded in UTF-8.  */
 	      if (ch == lr->escape_char)
 		{
 		  ch = lr_getc (lr);
+		  if (ch >= 0x80)
+		    {
+		      lr_error (lr, _("illegal 8-bit escape sequence"));
+		      illegal_string = true;
+		      break;
+		    }
 		  if (ch == '\n' || ch == EOF)
 		    break;
+		  addc (&lrb, ch);
+		  wch = ch;
+		}
+	      else if (ch < 0x80)
+		{
+		  wch = ch;
+		  addc (&lrb, ch);
+		}
+	      else 		/* UTF-8 sequence.  */
+		{
+		  if (!utf8_decode (lr, ch, &wch))
+		    {
+		      illegal_string = true;
+		      break;
+		    }
+		  if (!translate_unicode_codepoint (locale, charmap,
+						    repertoire, wch, &lrb))
+		    {
+		      /* Ignore the rest of the string.  Callers may
+			 skip this string because it cannot be encoded
+			 in the output character set.  */
+		      illegal_string = true;
+		      continue;
+		    }
 		}
 
-	      addc (&lrb, ch);
 	      if (return_widestr)
-		ADDWC ((uint32_t) ch);
+		ADDWC (wch);
 
 	      continue;
 	    }
-- 
2.35.3



^ permalink raw reply	[flat|nested] 15+ messages in thread

* [PATCH 5/5] de_DE: Convert to UTF-8
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
                   ` (3 preceding siblings ...)
  2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
@ 2022-05-19 21:06 ` Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
  2022-07-05  9:27   ` Andreas Schwab
  2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell
  5 siblings, 2 replies; 15+ messages in thread
From: Florian Weimer @ 2022-05-19 21:06 UTC (permalink / raw)
  To: libc-alpha

---
 localedata/locales/de_DE | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/localedata/locales/de_DE b/localedata/locales/de_DE
index 49a421fff4..52767808f7 100644
--- a/localedata/locales/de_DE
+++ b/localedata/locales/de_DE
@@ -46,36 +46,36 @@ include "translit_combining";""
 
 % German umlauts.
 % LATIN CAPITAL LETTER A WITH DIAERESIS.
-<U00C4> "A<U0308>";"AE"
+Ä "Ä";"AE"
 % LATIN CAPITAL LETTER O WITH DIAERESIS.
-<U00D6> "O<U0308>";"OE"
+Ö "Ö";"OE"
 % LATIN CAPITAL LETTER U WITH DIAERESIS.
-<U00DC> "U<U0308>";"UE"
+Ü "Ü";"UE"
 % LATIN SMALL LETTER A WITH DIAERESIS.
-<U00E4> "a<U0308>";"ae"
+ä "ä";"ae"
 % LATIN SMALL LETTER O WITH DIAERESIS.
-<U00F6> "o<U0308>";"oe"
+ö "ö";"oe"
 % LATIN SMALL LETTER U WITH DIAERESIS.
-<U00FC> "u<U0308>";"ue"
+ü "ü";"ue"
 
 % Danish.
 % LATIN CAPITAL LETTER A WITH RING ABOVE.
-<U00C5> "A<U030A>";"AA"
+Å "Å";"AA"
 % LATIN SMALL LETTER A WITH RING ABOVE.
-<U00E5> "a<U030A>";"aa"
+å "å";"aa"
 
 % The following strange first-level transliteration derive from the use
 % U201E and U201C as "correct" quoting characters.  These two characters
 % do not really belong together.  The result is that somebody who uses
 % U201C and U201D will get the incorrect U00AB / U00BB sequences.
 % LEFT DOUBLE QUOTATION MARK
-<U201C> <U00AB>;<U0022>
+“ «;<U0022>
 % RIGHT DOUBLE QUOTATION MARK
-<U201D> <U00BB>;<U0022>
+” »;<U0022>
 % DOUBLE LOW-9 QUOTATION MARK
-<U201E> <U00BB>;"<U002C><U002C>"
+„ »;",,"
 % DOUBLE HIGH-REVERSED-9 QUOTATION MARK
-<U201F> <U00AB>;<U0022>
+‟ «;<U0022>
 
 translit_end
 
@@ -90,7 +90,7 @@ END LC_COLLATE
 
 LC_MONETARY
 int_curr_symbol     "EUR "
-currency_symbol     "<U20AC>"
+currency_symbol     "€"
 mon_decimal_point   ","
 mon_thousands_sep   "."
 mon_grouping        3;3
@@ -126,14 +126,14 @@ day	"Sonntag";/
 	"Freitag";/
 	"Samstag"
 abmon	"Jan";"Feb";/
-	"M<U00E4>r";"Apr";/
+	"Mär";"Apr";/
 	"Mai";"Jun";/
 	"Jul";"Aug";/
 	"Sep";"Okt";/
 	"Nov";"Dez"
 mon	"Januar";/
 	"Februar";/
-	"M<U00E4>rz";/
+	"März";/
 	"April";/
 	"Mai";/
 	"Juni";/
@@ -172,7 +172,7 @@ END LC_PAPER
 
 LC_NAME
 name_fmt    "%d%t%g%t%m%t%f"
-name_miss   "Fr<U00E4>ulein"
+name_miss   "Fräulein"
 name_mr     "Herr"
 name_mrs    "Frau"
 name_ms     "Frau"
-- 
2.35.3


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 0/5] Assume UTF-8 encoding for localedef input files
  2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
                   ` (4 preceding siblings ...)
  2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
@ 2022-07-04 19:54 ` Carlos O'Donell
  5 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> This is a backwards-compatible change because of two localedef bugs that
> cause bytes outside the ASCII range to produce unpredictable results:
> 
>   If char is signed, conversion from the assumed ISO-8859-1 input format
>   to a UCS-4 codepoint does not produce the correct result.
> 
>   If the output character set is not overlapping ISO-8859-1 in the
>   characters used in the locale, the required character set conversion
>   is not applied.
> 
> This is why I think we can switch to UTF-8 without impacting backwards
> compatibility, and there is no need for an option to restore the old
> behavior.

I can agree with that. In some sense I think the parsing of locale files is something
that can require developers and users to adjust the syntax, though we'd like for it
to be backwards compatible. In this case it couldn't have worked.

Thank you for working on this to make locales easier to use.

I particularly appreciate the example conversion of de_DE.

Overall the series looks good and we should commit this ahead of glibc 2.36 so we can
get any new strings translated for the TP project. This series particularly adds some
error messages for the use of UTF-8 in the locale sources.

Again, I really appreciate that this makes it easier for natural language speakers
to write, adjust, and review locale sources. In cases where disambiguation is required
we still have the capacity to write it differently if we need to. This continues the
early work to convert from U-codes to ASCII.

Just like last time we had this discussion the idea that glibc would support
compiling locale sources on a system that lacks UTF-8 is no longer a requirement
that we should have for the library.

> Tested on i686-linux-gnu and x86_64-linux-gnu.
> 
> Thanks,
> Florian
> 
> Florian Weimer (5):
>   locale: Turn ADDC and ADDS into functions in linereader.c
>   locale: Fix signed char bug in lr_getc
>   locale: Introduce translate_unicode_codepoint into linereader.c
>   locale: localdef input files are now encoded in UTF-8
>   de_DE: Convert to UTF-8
> 
>  NEWS                         |   4 +
>  locale/programs/linereader.c | 504 ++++++++++++++++++++++-------------
>  locale/programs/linereader.h |   2 +-
>  localedata/locales/de_DE     |  32 +--
>  4 files changed, 338 insertions(+), 204 deletions(-)
> 
> 
> base-commit: 2d5ec6692f5746ccb11db60976a6481ef8e9d74f

-- 
Cheers,
Carlos.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c
  2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
@ 2022-07-04 19:54   ` Carlos O'Donell
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> And introduce struct lr_buffer.  The functions addc and adds can
> be called from functions, enabling subsequent refactoring.

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> ---
>  locale/programs/linereader.c | 203 ++++++++++++++++++-----------------
>  1 file changed, 104 insertions(+), 99 deletions(-)
> 
> diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
> index 146d32e5e2..d5367e0a1e 100644
> --- a/locale/programs/linereader.c
> +++ b/locale/programs/linereader.c
> @@ -416,36 +416,60 @@ get_toplvl_escape (struct linereader *lr)
>    return &lr->token;
>  }
>  
> +/* Multibyte string buffer.  */
> +struct lr_buffer
> +{
> +  size_t act;
> +  size_t max;
> +  char *buf;
> +};
>  
> -#define ADDC(ch) \
> -  do									      \
> -    {									      \
> -      if (bufact == bufmax)						      \
> -	{								      \
> -	  bufmax *= 2;							      \
> -	  buf = xrealloc (buf, bufmax);					      \
> -	}								      \
> -      buf[bufact++] = (ch);						      \
> -    }									      \
> -  while (0)

OK. Removing macros is awesome.

> +/* Initialize *LRB with a default-sized buffer.  */
> +static void
> +lr_buffer_init (struct lr_buffer *lrb)
> +{
> + lrb->act = 0;
> + lrb->max = 56;

OK. Taken from original code for a 56 byte maximum.

> + lrb->buf = xmalloc (lrb->max);
> +}

OK.

>  
> +/* Transfers the buffer string from *LRB to LR->token.mbstr.  */
> +static void
> +lr_buffer_to_token (struct lr_buffer *lrb, struct linereader *lr)
> +{
> +  lr->token.val.str.startmb = xrealloc (lrb->buf, lrb->act + 1);
> +  lr->token.val.str.startmb[lrb->act] = '\0';
> +  lr->token.val.str.lenmb = lrb->act;
> +}

OK.

>  
> -#define ADDS(s, l) \
> -  do									      \
> -    {									      \
> -      size_t _l = (l);							      \
> -      if (bufact + _l > bufmax)						      \
> -	{								      \
> -	  if (bufact < _l)						      \
> -	    bufact = _l;						      \
> -	  bufmax *= 2;							      \
> -	  buf = xrealloc (buf, bufmax);					      \
> -	}								      \
> -      memcpy (&buf[bufact], s, _l);					      \
> -      bufact += _l;							      \
> -    }									      \
> -  while (0)

OK. Removing macros is awesome.

> +/* Adds CH to *LRB.  */
> +static void
> +addc (struct lr_buffer *lrb, char ch)
> +{
> +  if (lrb->act == lrb->max)
> +    {
> +      lrb->max *= 2;
> +      lrb->buf = xrealloc (lrb->buf, lrb->max);
> +    }
> +  lrb->buf[lrb->act++] = ch;
> +}

OK. Matches original addc macro.

>  
> +/* Adds L bytes at S to *LRB.  */
> +static void
> +adds (struct lr_buffer *lrb, const unsigned char *s, size_t l)
> +{
> +  if (lrb->max - lrb->act < l)
> +    {
> +      size_t required_size = lrb->act + l;
> +      size_t new_max = 2 * lrb->max;
> +      if (new_max < required_size)
> +	new_max = required_size;
> +      lrb->buf = xrealloc (lrb->buf, new_max);
> +      lrb->max = new_max;
> +    }
> +  memcpy (lrb->buf + lrb->act, s, l);
> +  lrb->act += l;
> +}

OK. Matches original adds macro.

>  
>  #define ADDWC(ch) \
>    do									      \
> @@ -467,13 +491,11 @@ get_symname (struct linereader *lr)
>       1. reserved words
>       2. ISO 10646 position values
>       3. all other.  */
> -  char *buf;
> -  size_t bufact = 0;
> -  size_t bufmax = 56;

OK.

>    const struct keyword_t *kw;
>    int ch;
> +  struct lr_buffer lrb;
>  
> -  buf = (char *) xmalloc (bufmax);
> +  lr_buffer_init (&lrb);

OK.

>  
>    do
>      {
> @@ -481,13 +503,13 @@ get_symname (struct linereader *lr)
>        if (ch == lr->escape_char)
>  	{
>  	  int c2 = lr_getc (lr);
> -	  ADDC (c2);
> +	  addc (&lrb, c2);

OK.

>  
>  	  if (c2 == '\n')
>  	    ch = '\n';
>  	}
>        else
> -	ADDC (ch);
> +	addc (&lrb, ch);

OK.

>      }
>    while (ch != '>' && ch != '\n');
>  
> @@ -495,39 +517,35 @@ get_symname (struct linereader *lr)
>      lr_error (lr, _("unterminated symbolic name"));
>  
>    /* Test for ISO 10646 position value.  */
> -  if (buf[0] == 'U' && (bufact == 6 || bufact == 10))
> +  if (lrb.buf[0] == 'U' && (lrb.act == 6 || lrb.act == 10))

OK.

>      {
> -      char *cp = buf + 1;
> -      while (cp < &buf[bufact - 1] && isxdigit (*cp))
> +      char *cp = lrb.buf + 1;
> +      while (cp < &lrb.buf[lrb.act - 1] && isxdigit (*cp))

OK.

>  	++cp;
>  
> -      if (cp == &buf[bufact - 1])
> +      if (cp == &lrb.buf[lrb.act - 1])

OK.

>  	{
>  	  /* Yes, it is.  */
>  	  lr->token.tok = tok_ucs4;
> -	  lr->token.val.ucs4 = strtoul (buf + 1, NULL, 16);
> +	  lr->token.val.ucs4 = strtoul (lrb.buf + 1, NULL, 16);

OK.

>  
>  	  return &lr->token;
>  	}
>      }
>  
>    /* It is a symbolic name.  Test for reserved words.  */
> -  kw = lr->hash_fct (buf, bufact - 1);
> +  kw = lr->hash_fct (lrb.buf, lrb.act - 1);

OK.

>  
>    if (kw != NULL && kw->symname_or_ident == 1)
>      {
>        lr->token.tok = kw->token;
> -      free (buf);
> +      free (lrb.buf);

OK. We have an init function but then directly free lrb.buf. This is OK, I wonder if
there would be any use for lr_buffer_free () with some checking.

>      }
>    else
>      {
>        lr->token.tok = tok_bsymbol;
> -
> -      buf = xrealloc (buf, bufact + 1);
> -      buf[bufact] = '\0';
> -
> -      lr->token.val.str.startmb = buf;
> -      lr->token.val.str.lenmb = bufact - 1;
> +      lr_buffer_to_token (&lrb, lr);
> +      --lr->token.val.str.lenmb;  /* Hide the training '>'.  */

OK.

>      }
>  
>    return &lr->token;
> @@ -537,16 +555,13 @@ get_symname (struct linereader *lr)
>  static struct token *
>  get_ident (struct linereader *lr)
>  {
> -  char *buf;
> -  size_t bufact;
> -  size_t bufmax = 56;
>    const struct keyword_t *kw;
>    int ch;
> +  struct lr_buffer lrb;
>  
> -  buf = xmalloc (bufmax);
> -  bufact = 0;
> +  lr_buffer_init (&lrb);
>  
> -  ADDC (lr->buf[lr->idx - 1]);
> +  addc (&lrb, lr->buf[lr->idx - 1]);
>  
>    while (!isspace ((ch = lr_getc (lr))) && ch != '"' && ch != ';'
>  	 && ch != '<' && ch != ',' && ch != EOF)
> @@ -560,27 +575,22 @@ get_ident (struct linereader *lr)
>  	      break;
>  	    }
>  	}
> -      ADDC (ch);
> +      addc (&lrb, ch);
>      }
>  
>    lr_ungetc (lr, ch);
>  
> -  kw = lr->hash_fct (buf, bufact);
> +  kw = lr->hash_fct (lrb.buf, lrb.act);
>  
>    if (kw != NULL && kw->symname_or_ident == 0)
>      {
>        lr->token.tok = kw->token;
> -      free (buf);
> +      free (lrb.buf);

OK.

>      }
>    else
>      {
>        lr->token.tok = tok_ident;
> -
> -      buf = xrealloc (buf, bufact + 1);
> -      buf[bufact] = '\0';
> -
> -      lr->token.val.str.startmb = buf;
> -      lr->token.val.str.lenmb = bufact;
> +      lr_buffer_to_token (&lrb, lr);

OK.

>      }
>  
>    return &lr->token;
> @@ -593,14 +603,10 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	    int verbose)
>  {
>    int return_widestr = lr->return_widestr;
> -  char *buf;
> +  struct lr_buffer lrb;
>    wchar_t *buf2 = NULL;
> -  size_t bufact;
> -  size_t bufmax = 56;
>  
> -  /* We must return two different strings.  */
> -  buf = xmalloc (bufmax);
> -  bufact = 0;
> +  lr_buffer_init (&lrb);

OK.

>  
>    /* We know it'll be a string.  */
>    lr->token.tok = tok_string;
> @@ -613,19 +619,19 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>        buf2 = NULL;
>        while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
> -	ADDC (ch);
> +	addc (&lrb, ch);
>  
>        /* Catch errors with trailing escape character.  */
> -      if (bufact > 0 && buf[bufact - 1] == lr->escape_char
> -	  && (bufact == 1 || buf[bufact - 2] != lr->escape_char))
> +      if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
> +	  && (lrb.act == 1 || lrb.buf[lrb.act - 2] != lr->escape_char))
>  	{
>  	  lr_error (lr, _("illegal escape sequence at end of string"));
> -	  --bufact;
> +	  --lrb.act;
>  	}
>        else if (ch == '\n' || ch == EOF)
>  	lr_error (lr, _("unterminated string"));
>  
> -      ADDC ('\0');
> +      addc (&lrb, '\0');
>      }
>    else
>      {
> @@ -662,7 +668,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		    break;
>  		}
>  
> -	      ADDC (ch);
> +	      addc (&lrb, ch);
>  	      if (return_widestr)
>  		ADDWC ((uint32_t) ch);
>  
> @@ -671,7 +677,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>  	  /* Now we have to search for the end of the symbolic name, i.e.,
>  	     the closing '>'.  */
> -	  startidx = bufact;
> +	  startidx = lrb.act;

OK.

>  	  while ((ch = lr_getc (lr)) != '>' && ch != '\n' && ch != EOF)
>  	    {
>  	      if (ch == lr->escape_char)
> @@ -680,12 +686,12 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		  if (ch == '\n' || ch == EOF)
>  		    break;
>  		}
> -	      ADDC (ch);
> +	      addc (&lrb, ch);
>  	    }
>  	  if (ch == '\n' || ch == EOF)
>  	    /* Not a correct string.  */
>  	    break;
> -	  if (bufact == startidx)
> +	  if (lrb.act == startidx)
>  	    {
>  	      /* <> is no correct name.  Ignore it and also signal an
>  		 error.  */
> @@ -694,23 +700,23 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	    }
>  
>  	  /* It might be a Uxxxx symbol.  */
> -	  if (buf[startidx] == 'U'
> -	      && (bufact - startidx == 5 || bufact - startidx == 9))
> +	  if (lrb.buf[startidx] == 'U'
> +	      && (lrb.act - startidx == 5 || lrb.act - startidx == 9))
>  	    {
> -	      char *cp = buf + startidx + 1;
> -	      while (cp < &buf[bufact] && isxdigit (*cp))
> +	      char *cp = lrb.buf + startidx + 1;
> +	      while (cp < &lrb.buf[lrb.act] && isxdigit (*cp))
>  		++cp;
>  
> -	      if (cp == &buf[bufact])
> +	      if (cp == &lrb.buf[lrb.act])
>  		{
>  		  char utmp[10];
>  
>  		  /* Yes, it is.  */
> -		  ADDC ('\0');
> -		  wch = strtoul (buf + startidx + 1, NULL, 16);
> +		  addc (&lrb, '\0');
> +		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
>  
>  		  /* Now forget about the name we just added.  */
> -		  bufact = startidx;
> +		  lrb.act = startidx;
>  
>  		  if (return_widestr)
>  		    ADDWC (wch);
> @@ -774,7 +780,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  				      seq = charmap_find_value (charmap, utmp,
>  								9);
>  				      assert (seq != NULL);
> -				      ADDS (seq->bytes, seq->nbytes);
> +				      adds (&lrb, seq->bytes, seq->nbytes);
>  				    }
>  
>  				  continue;
> @@ -788,24 +794,24 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		    }
>  
>  		  if (seq != NULL)
> -		    ADDS (seq->bytes, seq->nbytes);
> +		    adds (&lrb, seq->bytes, seq->nbytes);
>  
>  		  continue;
>  		}
>  	    }
>  
> -	  /* We now have the symbolic name in buf[startidx] to
> -	     buf[bufact-1].  Now find out the value for this character
> +	  /* We now have the symbolic name in lrb.buf[startidx] to
> +	     lrb.buf[lrb.act-1].  Now find out the value for this character
>  	     in the charmap as well as in the repertoire map (in this
>  	     order).  */
> -	  seq = charmap_find_value (charmap, &buf[startidx],
> -				    bufact - startidx);
> +	  seq = charmap_find_value (charmap, &lrb.buf[startidx],
> +				    lrb.act - startidx);

OK.

>  
>  	  if (seq == NULL)
>  	    {
>  	      /* This name is not in the charmap.  */
>  	      lr_error (lr, _("symbol `%.*s' not in charmap"),
> -			(int) (bufact - startidx), &buf[startidx]);
> +			(int) (lrb.act - startidx), &lrb.buf[startidx]);

OK.

>  	      illegal_string = 1;
>  	    }
>  
> @@ -816,8 +822,8 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		wch = seq->ucs4;
>  	      else
>  		{
> -		  wch = repertoire_find_value (repertoire, &buf[startidx],
> -					       bufact - startidx);
> +		  wch = repertoire_find_value (repertoire, &lrb.buf[startidx],
> +					       lrb.act - startidx);
>  		  if (seq != NULL)
>  		    seq->ucs4 = wch;
>  		}
> @@ -826,7 +832,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		{
>  		  /* This name is not in the repertoire map.  */
>  		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
> -			    (int) (bufact - startidx), &buf[startidx]);
> +			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
>  		  illegal_string = 1;
>  		}
>  	      else
> @@ -834,11 +840,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	    }
>  
>  	  /* Now forget about the name we just added.  */
> -	  bufact = startidx;
> +	  lrb.act = startidx;
>  
>  	  /* And copy the bytes.  */
>  	  if (seq != NULL)
> -	    ADDS (seq->bytes, seq->nbytes);
> +	    adds (&lrb, seq->bytes, seq->nbytes);
>  	}
>  
>        if (ch == '\n' || ch == EOF)
> @@ -849,7 +855,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>        if (illegal_string)
>  	{
> -	  free (buf);
> +	  free (lrb.buf);

OK.

>  	  free (buf2);
>  	  lr->token.val.str.startmb = NULL;
>  	  lr->token.val.str.lenmb = 0;
> @@ -859,7 +865,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	  return &lr->token;
>  	}
>  
> -      ADDC ('\0');
> +      addc (&lrb, '\0');
>  
>        if (return_widestr)
>  	{
> @@ -870,8 +876,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	}
>      }
>  
> -  lr->token.val.str.startmb = xrealloc (buf, bufact);
> -  lr->token.val.str.lenmb = bufact;
> +  lr_buffer_to_token (&lrb, lr);
>  
>    return &lr->token;
>  }


-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 2/5] locale: Fix signed char bug in lr_getc
  2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
@ 2022-07-04 19:54   ` Carlos O'Donell
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> The array lr->buf contains characters, which can be signed.  A 0xff
> byte in the input could be incorrectly reported as EOF.  More
> importantly, get_string in linereader.c converts a signed input byte
> to a Unicode code point using ADDWC ((uint32_t) ch), under the
> assumption that this decodes the ISO-8859-1 input encoding.  If char
> is signed, this does not give the correct result.  This means that
> ISO-8859-1 input files for localedef are not actually supported,
> contrary to the comment in get_string.  This is a happy accident because
> we can therefore change the file encoding to UTF-8 without impacting
> backwards compatibility.

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> 
> While at it, remove the \32 check for MS-DOS end-of-file character (^Z).

OK. We don't need this, files should have the correct EOF.

> ---
>  locale/programs/linereader.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/locale/programs/linereader.h b/locale/programs/linereader.h
> index 0fb10ec833..653a71d2d1 100644
> --- a/locale/programs/linereader.h
> +++ b/locale/programs/linereader.h
> @@ -134,7 +134,7 @@ lr_getc (struct linereader *lr)
>  	return EOF;
>      }
>  
> -  return lr->buf[lr->idx] == '\32' ? EOF : lr->buf[lr->idx++];
> +  return lr->buf[lr->idx++] & 0xff;

OK. Agreed, this should not be sign extended. It's a byte in the buffer not EOF.
With the original MS-DOS checking it might have been *needed* to return -1.

>  }
>  
>  


-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c
  2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
@ 2022-07-04 19:54   ` Carlos O'Donell
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> This will permit reusing the Unicode character processing for
> different character encodings, not just the current <U...> encoding.

LGTM. Straight forward refactor.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> ---
>  locale/programs/linereader.c | 167 ++++++++++++++++++-----------------
>  1 file changed, 85 insertions(+), 82 deletions(-)
> 
> diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
> index d5367e0a1e..f7292f0102 100644
> --- a/locale/programs/linereader.c
> +++ b/locale/programs/linereader.c
> @@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
>    return &lr->token;
>  }
>  
> +/* Process a decoded Unicode codepoint WCH in a string, placing the
> +   multibyte sequence into LRB.  Return false if the character is not
> +   found in CHARMAP/REPERTOIRE.  */
> +static bool
> +translate_unicode_codepoint (struct localedef_t *locale,
> +			     const struct charmap_t *charmap,
> +			     const struct repertoire_t *repertoire,
> +			     uint32_t wch, struct lr_buffer *lrb)
> +{
> +  /* See whether the charmap contains the Uxxxxxxxx names.  */
> +  char utmp[10];
> +  snprintf (utmp, sizeof (utmp), "U%08X", wch);
> +  struct charseq *seq = charmap_find_value (charmap, utmp, 9);
> +
> +  if (seq == NULL)
> +    {
> +      /* No, this isn't the case.  Now determine from
> +	 the repertoire the name of the character and
> +	 find it in the charmap.  */
> +      if (repertoire != NULL)
> +	{
> +	  const char *symbol = repertoire_find_symbol (repertoire, wch);
> +	  if (symbol != NULL)
> +	    seq = charmap_find_value (charmap, symbol, strlen (symbol));
> +	}
> +
> +      if (seq == NULL)
> +	{
> +#ifndef NO_TRANSLITERATION
> +	  /* Transliterate if possible.  */
> +	  if (locale != NULL)
> +	    {
> +	      if ((locale->avail & CTYPE_LOCALE) == 0)
> +		{
> +		  /* Load the CTYPE data now.  */
> +		  int old_needed = locale->needed;
> +
> +		  locale->needed = 0;
> +		  locale = load_locale (LC_CTYPE, locale->name,
> +					locale->repertoire_name,
> +					charmap, locale);
> +		  locale->needed = old_needed;
> +		}
> +
> +	      uint32_t *translit;
> +	      if ((locale->avail & CTYPE_LOCALE) != 0
> +		  && ((translit = find_translit (locale, charmap, wch))
> +		      != NULL))
> +		/* The CTYPE data contains a matching
> +		   transliteration.  */
> +		{
> +		  for (int i = 0; translit[i] != 0; ++i)
> +		    {
> +		      snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
> +		      seq = charmap_find_value (charmap, utmp, 9);
> +		      assert (seq != NULL);
> +		      adds (lrb, seq->bytes, seq->nbytes);
> +		    }
> +		  return true;
> +		}
> +	    }
> +#endif	/* NO_TRANSLITERATION */
> +
> +	  /* Not a known name.  */
> +	  return false;
> +	}
> +    }
> +
> +  if (seq != NULL)
> +    {
> +      adds (lrb, seq->bytes, seq->nbytes);
> +      return true;
> +    }
> +  else
> +    return false;
> +}
> +
>  
>  static struct token *
>  get_string (struct linereader *lr, const struct charmap_t *charmap,
> @@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>      }
>    else
>      {
> -      int illegal_string = 0;
> +      bool illegal_string = false;
>        size_t buf2act = 0;
>        size_t buf2max = 56 * sizeof (uint32_t);
>        int ch;
> @@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	    {
>  	      /* <> is no correct name.  Ignore it and also signal an
>  		 error.  */
> -	      illegal_string = 1;
> +	      illegal_string = true;
>  	      continue;
>  	    }
>  
> @@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>  	      if (cp == &lrb.buf[lrb.act])
>  		{
> -		  char utmp[10];
> -
>  		  /* Yes, it is.  */
>  		  addc (&lrb, '\0');
>  		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
> @@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		  if (return_widestr)
>  		    ADDWC (wch);
>  
> -		  /* See whether the charmap contains the Uxxxxxxxx names.  */
> -		  snprintf (utmp, sizeof (utmp), "U%08X", wch);
> -		  seq = charmap_find_value (charmap, utmp, 9);
> -
> -		  if (seq == NULL)
> -		    {
> -		     /* No, this isn't the case.  Now determine from
> -			the repertoire the name of the character and
> -			find it in the charmap.  */
> -		      if (repertoire != NULL)
> -			{
> -			  const char *symbol;
> -
> -			  symbol = repertoire_find_symbol (repertoire, wch);
> -
> -			  if (symbol != NULL)
> -			    seq = charmap_find_value (charmap, symbol,
> -						      strlen (symbol));
> -			}
> -
> -		      if (seq == NULL)
> -			{
> -#ifndef NO_TRANSLITERATION
> -			  /* Transliterate if possible.  */
> -			  if (locale != NULL)
> -			    {
> -			      uint32_t *translit;
> -
> -			      if ((locale->avail & CTYPE_LOCALE) == 0)
> -				{
> -				  /* Load the CTYPE data now.  */
> -				  int old_needed = locale->needed;
> -
> -				  locale->needed = 0;
> -				  locale = load_locale (LC_CTYPE,
> -							locale->name,
> -							locale->repertoire_name,
> -							charmap, locale);
> -				  locale->needed = old_needed;
> -				}
> -
> -			      if ((locale->avail & CTYPE_LOCALE) != 0
> -				  && ((translit = find_translit (locale,
> -								 charmap, wch))
> -				      != NULL))
> -				/* The CTYPE data contains a matching
> -				   transliteration.  */
> -				{
> -				  int i;
> -
> -				  for (i = 0; translit[i] != 0; ++i)
> -				    {
> -				      char utmp[10];
> -
> -				      snprintf (utmp, sizeof (utmp), "U%08X",
> -						translit[i]);
> -				      seq = charmap_find_value (charmap, utmp,
> -								9);
> -				      assert (seq != NULL);
> -				      adds (&lrb, seq->bytes, seq->nbytes);
> -				    }
> -
> -				  continue;
> -				}
> -			    }
> -#endif	/* NO_TRANSLITERATION */
> -
> -			  /* Not a known name.  */
> -			  illegal_string = 1;
> -			}
> -		    }
> -
> -		  if (seq != NULL)
> -		    adds (&lrb, seq->bytes, seq->nbytes);
> -
> +		  if (!translate_unicode_codepoint (locale, charmap,
> +						    repertoire, wch, &lrb))
> +		    illegal_string = true;

OK. Refactor.

>  		  continue;
>  		}
>  	    }
> @@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	      /* This name is not in the charmap.  */
>  	      lr_error (lr, _("symbol `%.*s' not in charmap"),
>  			(int) (lrb.act - startidx), &lrb.buf[startidx]);
> -	      illegal_string = 1;
> +	      illegal_string = true;
>  	    }
>  
>  	  if (return_widestr)
> @@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		  /* This name is not in the repertoire map.  */
>  		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
>  			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
> -		  illegal_string = 1;
> +		  illegal_string = true;
>  		}
>  	      else
>  		ADDWC (wch);
> @@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>        if (ch == '\n' || ch == EOF)
>  	{
>  	  lr_error (lr, _("unterminated string"));
> -	  illegal_string = 1;
> +	  illegal_string = true;
>  	}
>  
>        if (illegal_string)

OK.

-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 4/5] locale: localdef input files are now encoded in UTF-8
  2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
@ 2022-07-04 19:54   ` Carlos O'Donell
  0 siblings, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> Previously, they were assumed to be in ISO-8859-1, and that the output
> charset overlapped with ISO-8859-1 for the characters actually used.
> However, this did not work as intended on many architectures even for
> an ISO-8859-1 output encoding because of the char signedness bug in
> lr_getc.  Therefore, this commit switches to UTF-8 without making
> provisions for backwards compatibility.

That sounds acceptable to me.

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> 
> The following Elisp code can be used to convert locale definition files
> to UTF-8:
> 
> (defun glibc/convert-localedef (from to)
>   (interactive "r")
>   (save-excursion
>     (save-restriction
>       (narrow-to-region from to)
>       (goto-char (point-min))
>       (save-match-data
> 	(while (re-search-forward "<U\\([0-9a-fA-F]+\\)>" nil t)
> 	  (let* ((codepoint (string-to-number (match-string 1) 16))
> 		 (converted
> 		  (cond
> 		   ((memq codepoint '(?/ ?\ ?< ?>))
> 		    (string ?/ codepoint))
> 		   ((= codepoint ?\") "<U0022>")
> 		   (t (string codepoint)))))
> 	    (replace-match converted t)))))))

OK.

> ---
>  NEWS                         |   4 +
>  locale/programs/linereader.c | 144 ++++++++++++++++++++++++++++++++---
>  2 files changed, 137 insertions(+), 11 deletions(-)
> 
> diff --git a/NEWS b/NEWS
> index ad0c08d8ca..7ce0d8a135 100644
> --- a/NEWS
> +++ b/NEWS
> @@ -20,6 +20,10 @@ Major new features:
>    have been added.  The pidfd functionality provides access to a process
>    while avoiding the issue of PID reuse on tranditional Unix systems.
>  
> +* localedef now accepts locale definition files encoded in UTF-8.
> +  Previously, input bytes not within the ASCII range resulted in
> +  unpredictable output.

OK.

> +
>  Deprecated and removed features, and other changes affecting compatibility:
>  
>  * Support for prelink will be removed in the next release; this includes
> diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
> index f7292f0102..b484327969 100644
> --- a/locale/programs/linereader.c
> +++ b/locale/programs/linereader.c
> @@ -42,6 +42,7 @@ static struct token *get_string (struct linereader *lr,
>  				 struct localedef_t *locale,
>  				 const struct repertoire_t *repertoire,
>  				 int verbose);
> +static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch);

OK.

>  
>  
>  struct linereader *
> @@ -327,6 +328,17 @@ lr_token (struct linereader *lr, const struct charmap_t *charmap,
>  	}
>        lr_ungetn (lr, 2);
>        break;
> +
> +    case 0x80 ... 0xff:		/* UTF-8 sequence.  */
> +      uint32_t wch;
> +      if (!utf8_decode (lr, ch, &wch))
> +	{
> +	  lr->token.tok = tok_error;
> +	  return &lr->token;
> +	}
> +      lr->token.tok = tok_ucs4;
> +      lr->token.val.ucs4 = wch;
> +      return &lr->token;

OK.

>      }
>  
>    return get_ident (lr);
> @@ -673,6 +685,87 @@ translate_unicode_codepoint (struct localedef_t *locale,
>      return false;
>  }
>  
> +/* Returns true if ch is not EOF (that is, non-negative) and a valid
> +   UTF-8 trailing byte.  */
> +static bool
> +utf8_valid_trailing (int ch)
> +{
> +  return ch >= 0 && (ch & 0xc0) == 0x80;
> +}

OK.

> +
> +/* Reports an error for a broken UTF-8 sequence.  CH2 to CH4 may be
> +   EOF.  Always returns false.  */
> +static bool
> +utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3,
> +		     int ch4)
> +{
> +  char buf[30];
> +
> +  if (ch2 < 0)
> +    snprintf (buf, sizeof (buf), "0x%02x", ch1);
> +  else if (ch3 < 0)
> +    snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2);
> +  else if (ch4 < 0)
> +    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3);
> +  else
> +    snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x",
> +	      ch1, ch2, ch3, ch4);
> +
> +  lr_error (lr, _("invalid UTF-8 sequence %s"), buf);
> +  return false;

OK.

> +}
> +
> +/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and
> +   stores the decoded codepoint in *WCH.  Returns false on failure and
> +   reports an error.  */
> +static bool
> +utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch)
> +{
> +  /* See RFC 3629 section 4 and __gconv_transform_utf8_internal.  */
> +  if (ch1 < 0xc2)
> +    return utf8_sequence_error (lr, ch1, -1, -1, -1);

OK. While ASCII goes to 7F the standard says ch2 would be > c2, so cover the whole range here.

> +
> +  int ch2 = lr_getc (lr);
> +  if (!utf8_valid_trailing (ch2))
> +    return utf8_sequence_error (lr, ch1, ch2, -1, -1);
> +
> +  if (ch1 <= 0xdf)
> +    {
> +      uint32_t result = ((ch1 & 0x1f)  << 6) | (ch2 & 0x3f);
> +      if (result < 0x80)
> +	return utf8_sequence_error (lr, ch1, ch2, -1, -1);
> +      *wch = result;
> +      return true;
> +    }

OK.

> +
> +  int ch3 = lr_getc (lr);
> +  if (!utf8_valid_trailing (ch3) || ch1 < 0xe0)
> +    return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
> +
> +  if (ch1 <= 0xef)
> +    {
> +      uint32_t result = (((ch1 & 0x0f)  << 12)
> +			 | ((ch2 & 0x3f) << 6)
> +			 | (ch3 & 0x3f));
> +      if (result < 0x800)
> +	return utf8_sequence_error (lr, ch1, ch2, ch3, -1);
> +      *wch = result;
> +      return true;
> +    }
> +
> +  int ch4 = lr_getc (lr);
> +  if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4)
> +    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
> +
> +  uint32_t result = (((ch1 & 0x07)  << 18)
> +		     | ((ch2 & 0x3f) << 12)
> +		     | ((ch3 & 0x3f) << 6)
> +		     | (ch4 & 0x3f));
> +  if (result < 0x10000)
> +    return utf8_sequence_error (lr, ch1, ch2, ch3, ch4);
> +  *wch = result;
> +  return true;
> +}

OK.

>  
>  static struct token *
>  get_string (struct linereader *lr, const struct charmap_t *charmap,
> @@ -696,7 +789,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>        buf2 = NULL;
>        while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF)
> -	addc (&lrb, ch);
> +	{
> +	  if (ch >= 0x80)
> +	    lr_error (lr, _("illegal 8-bit character in untranslated string"));
> +	  addc (&lrb, ch);
> +	}

OK.

>  
>        /* Catch errors with trailing escape character.  */
>        if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char
> @@ -730,24 +827,49 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>  	  if (ch != '<')
>  	    {
> -	      /* The standards leave it up to the implementation to decide
> -		 what to do with character which stand for themself.  We
> -		 could jump through hoops to find out the value relative to
> -		 the charmap and the repertoire map, but instead we leave
> -		 it up to the locale definition author to write a better
> -		 definition.  We assume here that every character which
> -		 stands for itself is encoded using ISO 8859-1.  Using the
> -		 escape character is allowed.  */
> +	      /* The standards leave it up to the implementation to
> +		 decide what to do with characters which stand for
> +		 themselves.  This implementation treats the input
> +		 file as encoded in UTF-8.  */

OK. This is the right direction.

>  	      if (ch == lr->escape_char)
>  		{
>  		  ch = lr_getc (lr);
> +		  if (ch >= 0x80)
> +		    {
> +		      lr_error (lr, _("illegal 8-bit escape sequence"));
> +		      illegal_string = true;
> +		      break;
> +		    }

OK.

>  		  if (ch == '\n' || ch == EOF)
>  		    break;
> +		  addc (&lrb, ch);
> +		  wch = ch;
> +		}
> +	      else if (ch < 0x80)
> +		{
> +		  wch = ch;
> +		  addc (&lrb, ch);
> +		}
> +	      else 		/* UTF-8 sequence.  */
> +		{
> +		  if (!utf8_decode (lr, ch, &wch))
> +		    {
> +		      illegal_string = true;
> +		      break;
> +		    }
> +		  if (!translate_unicode_codepoint (locale, charmap,
> +						    repertoire, wch, &lrb))
> +		    {
> +		      /* Ignore the rest of the string.  Callers may
> +			 skip this string because it cannot be encoded
> +			 in the output character set.  */
> +		      illegal_string = true;
> +		      continue;
> +		    }
>  		}
>  
> -	      addc (&lrb, ch);
>  	      if (return_widestr)
> -		ADDWC ((uint32_t) ch);
> +		ADDWC (wch);
>  
>  	      continue;
>  	    }


-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/5] de_DE: Convert to UTF-8
  2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
@ 2022-07-04 19:54   ` Carlos O'Donell
  2022-07-05  9:27   ` Andreas Schwab
  1 sibling, 0 replies; 15+ messages in thread
From: Carlos O'Donell @ 2022-07-04 19:54 UTC (permalink / raw)
  To: Florian Weimer, libc-alpha

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> ---

I appreciate that you did a test conversion!

LGTM.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>


>  localedata/locales/de_DE | 32 ++++++++++++++++----------------
>  1 file changed, 16 insertions(+), 16 deletions(-)
> 
> diff --git a/localedata/locales/de_DE b/localedata/locales/de_DE
> index 49a421fff4..52767808f7 100644
> --- a/localedata/locales/de_DE
> +++ b/localedata/locales/de_DE
> @@ -46,36 +46,36 @@ include "translit_combining";""
>  
>  % German umlauts.
>  % LATIN CAPITAL LETTER A WITH DIAERESIS.
> -<U00C4> "A<U0308>";"AE"
> +Ä "Ä";"AE"
>  % LATIN CAPITAL LETTER O WITH DIAERESIS.
> -<U00D6> "O<U0308>";"OE"
> +Ö "Ö";"OE"
>  % LATIN CAPITAL LETTER U WITH DIAERESIS.
> -<U00DC> "U<U0308>";"UE"
> +Ü "Ü";"UE"
>  % LATIN SMALL LETTER A WITH DIAERESIS.
> -<U00E4> "a<U0308>";"ae"
> +ä "ä";"ae"
>  % LATIN SMALL LETTER O WITH DIAERESIS.
> -<U00F6> "o<U0308>";"oe"
> +ö "ö";"oe"
>  % LATIN SMALL LETTER U WITH DIAERESIS.
> -<U00FC> "u<U0308>";"ue"
> +ü "ü";"ue"
>  
>  % Danish.
>  % LATIN CAPITAL LETTER A WITH RING ABOVE.
> -<U00C5> "A<U030A>";"AA"
> +Å "Å";"AA"
>  % LATIN SMALL LETTER A WITH RING ABOVE.
> -<U00E5> "a<U030A>";"aa"
> +å "å";"aa"
>  
>  % The following strange first-level transliteration derive from the use
>  % U201E and U201C as "correct" quoting characters.  These two characters
>  % do not really belong together.  The result is that somebody who uses
>  % U201C and U201D will get the incorrect U00AB / U00BB sequences.
>  % LEFT DOUBLE QUOTATION MARK
> -<U201C> <U00AB>;<U0022>
> +“ «;<U0022>
>  % RIGHT DOUBLE QUOTATION MARK
> -<U201D> <U00BB>;<U0022>
> +” »;<U0022>
>  % DOUBLE LOW-9 QUOTATION MARK
> -<U201E> <U00BB>;"<U002C><U002C>"
> +„ »;",,"
>  % DOUBLE HIGH-REVERSED-9 QUOTATION MARK
> -<U201F> <U00AB>;<U0022>
> +‟ «;<U0022>
>  
>  translit_end
>  
> @@ -90,7 +90,7 @@ END LC_COLLATE
>  
>  LC_MONETARY
>  int_curr_symbol     "EUR "
> -currency_symbol     "<U20AC>"
> +currency_symbol     "€"
>  mon_decimal_point   ","
>  mon_thousands_sep   "."
>  mon_grouping        3;3
> @@ -126,14 +126,14 @@ day	"Sonntag";/
>  	"Freitag";/
>  	"Samstag"
>  abmon	"Jan";"Feb";/
> -	"M<U00E4>r";"Apr";/
> +	"Mär";"Apr";/
>  	"Mai";"Jun";/
>  	"Jul";"Aug";/
>  	"Sep";"Okt";/
>  	"Nov";"Dez"
>  mon	"Januar";/
>  	"Februar";/
> -	"M<U00E4>rz";/
> +	"März";/
>  	"April";/
>  	"Mai";/
>  	"Juni";/
> @@ -172,7 +172,7 @@ END LC_PAPER
>  
>  LC_NAME
>  name_fmt    "%d%t%g%t%m%t%f"
> -name_miss   "Fr<U00E4>ulein"
> +name_miss   "Fräulein"
>  name_mr     "Herr"
>  name_mrs    "Frau"
>  name_ms     "Frau"


-- 
Cheers,
Carlos.


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/5] de_DE: Convert to UTF-8
  2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
  2022-07-04 19:54   ` Carlos O'Donell
@ 2022-07-05  9:27   ` Andreas Schwab
  2022-07-05  9:55     ` Florian Weimer
  1 sibling, 1 reply; 15+ messages in thread
From: Andreas Schwab @ 2022-07-05  9:27 UTC (permalink / raw)
  To: Florian Weimer via Libc-alpha; +Cc: Florian Weimer

On Mai 19 2022, Florian Weimer via Libc-alpha wrote:

> diff --git a/localedata/locales/de_DE b/localedata/locales/de_DE
> index 49a421fff4..52767808f7 100644
> --- a/localedata/locales/de_DE
> +++ b/localedata/locales/de_DE
> @@ -46,36 +46,36 @@ include "translit_combining";""
>  
>  % German umlauts.
>  % LATIN CAPITAL LETTER A WITH DIAERESIS.
> -<U00C4> "A<U0308>";"AE"
> +Ä "Ä";"AE"
>  % LATIN CAPITAL LETTER O WITH DIAERESIS.
> -<U00D6> "O<U0308>";"OE"
> +Ö "Ö";"OE"
>  % LATIN CAPITAL LETTER U WITH DIAERESIS.
> -<U00DC> "U<U0308>";"UE"
> +Ü "Ü";"UE"
>  % LATIN SMALL LETTER A WITH DIAERESIS.
> -<U00E4> "a<U0308>";"ae"
> +ä "ä";"ae"
>  % LATIN SMALL LETTER O WITH DIAERESIS.
> -<U00F6> "o<U0308>";"oe"
> +ö "ö";"oe"
>  % LATIN SMALL LETTER U WITH DIAERESIS.
> -<U00FC> "u<U0308>";"ue"
> +ü "ü";"ue"
>  
>  % Danish.
>  % LATIN CAPITAL LETTER A WITH RING ABOVE.
> -<U00C5> "A<U030A>";"AA"
> +Å "Å";"AA"
>  % LATIN SMALL LETTER A WITH RING ABOVE.
> -<U00E5> "a<U030A>";"aa"
> +å "å";"aa"

This has the danger of an overeager editor converting between NFC and
NFD.

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/5] de_DE: Convert to UTF-8
  2022-07-05  9:27   ` Andreas Schwab
@ 2022-07-05  9:55     ` Florian Weimer
  2022-07-05 10:38       ` Andreas Schwab
  0 siblings, 1 reply; 15+ messages in thread
From: Florian Weimer @ 2022-07-05  9:55 UTC (permalink / raw)
  To: Andreas Schwab; +Cc: Florian Weimer via Libc-alpha

* Andreas Schwab:

>> -<U00E5> "a<U030A>";"aa"
>> +å "å";"aa"
>
> This has the danger of an overeager editor converting between NFC and
> NFD.

Wouldn't we catch this during patch review, similar to a spurious
whitespace change?

(And I think that ideally, normalization of combining characters would
be handled by the collation framework, and not within the data.)

Thanks,
Florian


^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH 5/5] de_DE: Convert to UTF-8
  2022-07-05  9:55     ` Florian Weimer
@ 2022-07-05 10:38       ` Andreas Schwab
  0 siblings, 0 replies; 15+ messages in thread
From: Andreas Schwab @ 2022-07-05 10:38 UTC (permalink / raw)
  To: Florian Weimer; +Cc: Florian Weimer via Libc-alpha

On Jul 05 2022, Florian Weimer wrote:

> * Andreas Schwab:
>
>>> -<U00E5> "a<U030A>";"aa"
>>> +å "å";"aa"
>>
>> This has the danger of an overeager editor converting between NFC and
>> NFD.
>
> Wouldn't we catch this during patch review, similar to a spurious
> whitespace change?

If all reviewers are careful enough to use diff-mode.

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE  1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2022-07-05 10:38 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-07-05  9:27   ` Andreas Schwab
2022-07-05  9:55     ` Florian Weimer
2022-07-05 10:38       ` Andreas Schwab
2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).