From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 2178) id F3FDB38582AD; Tue, 5 Jul 2022 07:29:46 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org F3FDB38582AD Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Florian Weimer To: glibc-cvs@sourceware.org Subject: [glibc] locale: localdef input files are now encoded in UTF-8 X-Act-Checkin: glibc X-Git-Author: Florian Weimer X-Git-Refname: refs/heads/master X-Git-Oldrev: 7dcaabb94caa00c9dd68a207ea62fef5a2551ac4 X-Git-Newrev: b15538d77c6a7893c8bb42831dcd3a1a12b727d4 Message-Id: <20220705072946.F3FDB38582AD@sourceware.org> Date: Tue, 5 Jul 2022 07:29:46 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 05 Jul 2022 07:29:47 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=b15538d77c6a7893c8bb42831dcd3a1a12b727d4 commit b15538d77c6a7893c8bb42831dcd3a1a12b727d4 Author: Florian Weimer Date: Tue Jul 5 09:05:45 2022 +0200 locale: localdef input files are now encoded in UTF-8 Previously, they were assumed to be in ISO-8859-1, and that the output charset overlapped with ISO-8859-1 for the characters actually used. However, this did not work as intended on many architectures even for an ISO-8859-1 output encoding because of the char signedness bug in lr_getc. Therefore, this commit switches to UTF-8 without making provisions for backwards compatibility. 
The following Elisp code can be used to convert locale definition files to UTF-8: (defun glibc/convert-localedef (from to) (interactive "r") (save-excursion (save-restriction (narrow-to-region from to) (goto-char (point-min)) (save-match-data (while (re-search-forward "<U\\([0-9A-F]+\\)>" nil t) (let* ((codepoint (string-to-number (match-string 1) 16)) (converted (cond ((memq codepoint '(?/ ?\ ?< ?>)) (string ?/ codepoint)) ((= codepoint ?\") "<U0022>") (t (string codepoint))))) (replace-match converted t))))))) Reviewed-by: Carlos O'Donell Tested-by: Carlos O'Donell Diff: --- NEWS | 4 ++ locale/programs/linereader.c | 144 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 137 insertions(+), 11 deletions(-) diff --git a/NEWS b/NEWS index b0a3d7e512..163bcfad2e 100644 --- a/NEWS +++ b/NEWS @@ -46,6 +46,10 @@ Major new features: to more flexibly configure and operate on filesystem mounts. The new mount APIs are specifically designed to work with namespaces. +* localedef now accepts locale definition files encoded in UTF-8. + Previously, input bytes not within the ASCII range resulted in + unpredictable output. + Deprecated and removed features, and other changes affecting compatibility: * Support for prelink will be removed in the next release; this includes diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c index f7292f0102..b484327969 100644 --- a/locale/programs/linereader.c +++ b/locale/programs/linereader.c @@ -42,6 +42,7 @@ static struct token *get_string (struct linereader *lr, struct localedef_t *locale, const struct repertoire_t *repertoire, int verbose); +static bool utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch); struct linereader * @@ -327,6 +328,17 @@ lr_token (struct linereader *lr, const struct charmap_t *charmap, } lr_ungetn (lr, 2); break; + + case 0x80 ... 0xff: /* UTF-8 sequence. 
*/ + uint32_t wch; + if (!utf8_decode (lr, ch, &wch)) + { + lr->token.tok = tok_error; + return &lr->token; + } + lr->token.tok = tok_ucs4; + lr->token.val.ucs4 = wch; + return &lr->token; } return get_ident (lr); @@ -673,6 +685,87 @@ translate_unicode_codepoint (struct localedef_t *locale, return false; } +/* Returns true if ch is not EOF (that is, non-negative) and a valid + UTF-8 trailing byte. */ +static bool +utf8_valid_trailing (int ch) +{ + return ch >= 0 && (ch & 0xc0) == 0x80; +} + +/* Reports an error for a broken UTF-8 sequence. CH2 to CH4 may be + EOF. Always returns false. */ +static bool +utf8_sequence_error (struct linereader *lr, uint8_t ch1, int ch2, int ch3, + int ch4) +{ + char buf[30]; + + if (ch2 < 0) + snprintf (buf, sizeof (buf), "0x%02x", ch1); + else if (ch3 < 0) + snprintf (buf, sizeof (buf), "0x%02x 0x%02x", ch1, ch2); + else if (ch4 < 0) + snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x", ch1, ch2, ch3); + else + snprintf (buf, sizeof (buf), "0x%02x 0x%02x 0x%02x 0x%02x", + ch1, ch2, ch3, ch4); + + lr_error (lr, _("invalid UTF-8 sequence %s"), buf); + return false; +} + +/* Reads a UTF-8 sequence from LR, with the leading byte CH1, and + stores the decoded codepoint in *WCH. Returns false on failure and + reports an error. */ +static bool +utf8_decode (struct linereader *lr, uint8_t ch1, uint32_t *wch) +{ + /* See RFC 3629 section 4 and __gconv_transform_utf8_internal. 
*/ + if (ch1 < 0xc2) + return utf8_sequence_error (lr, ch1, -1, -1, -1); + + int ch2 = lr_getc (lr); + if (!utf8_valid_trailing (ch2)) + return utf8_sequence_error (lr, ch1, ch2, -1, -1); + + if (ch1 <= 0xdf) + { + uint32_t result = ((ch1 & 0x1f) << 6) | (ch2 & 0x3f); + if (result < 0x80) + return utf8_sequence_error (lr, ch1, ch2, -1, -1); + *wch = result; + return true; + } + + int ch3 = lr_getc (lr); + if (!utf8_valid_trailing (ch3) || ch1 < 0xe0) + return utf8_sequence_error (lr, ch1, ch2, ch3, -1); + + if (ch1 <= 0xef) + { + uint32_t result = (((ch1 & 0x0f) << 12) + | ((ch2 & 0x3f) << 6) + | (ch3 & 0x3f)); + if (result < 0x800) + return utf8_sequence_error (lr, ch1, ch2, ch3, -1); + *wch = result; + return true; + } + + int ch4 = lr_getc (lr); + if (!utf8_valid_trailing (ch4) || ch1 < 0xf0 || ch1 > 0xf4) + return utf8_sequence_error (lr, ch1, ch2, ch3, ch4); + + uint32_t result = (((ch1 & 0x07) << 18) + | ((ch2 & 0x3f) << 12) + | ((ch3 & 0x3f) << 6) + | (ch4 & 0x3f)); + if (result < 0x10000) + return utf8_sequence_error (lr, ch1, ch2, ch3, ch4); + *wch = result; + return true; +} static struct token * get_string (struct linereader *lr, const struct charmap_t *charmap, @@ -696,7 +789,11 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, buf2 = NULL; while ((ch = lr_getc (lr)) != '"' && ch != '\n' && ch != EOF) - addc (&lrb, ch); + { + if (ch >= 0x80) + lr_error (lr, _("illegal 8-bit character in untranslated string")); + addc (&lrb, ch); + } /* Catch errors with trailing escape character. */ if (lrb.act > 0 && lrb.buf[lrb.act - 1] == lr->escape_char @@ -730,24 +827,49 @@ get_string (struct linereader *lr, const struct charmap_t *charmap, if (ch != '<') { - /* The standards leave it up to the implementation to decide - what to do with character which stand for themself. 
We - could jump through hoops to find out the value relative to - the charmap and the repertoire map, but instead we leave - it up to the locale definition author to write a better - definition. We assume here that every character which - stands for itself is encoded using ISO 8859-1. Using the - escape character is allowed. */ + /* The standards leave it up to the implementation to + decide what to do with characters which stand for + themselves. This implementation treats the input + file as encoded in UTF-8. */ if (ch == lr->escape_char) { ch = lr_getc (lr); + if (ch >= 0x80) + { + lr_error (lr, _("illegal 8-bit escape sequence")); + illegal_string = true; + break; + } if (ch == '\n' || ch == EOF) break; + addc (&lrb, ch); + wch = ch; + } + else if (ch < 0x80) + { + wch = ch; + addc (&lrb, ch); + } + else /* UTF-8 sequence. */ + { + if (!utf8_decode (lr, ch, &wch)) + { + illegal_string = true; + break; + } + if (!translate_unicode_codepoint (locale, charmap, + repertoire, wch, &lrb)) + { + /* Ignore the rest of the string. Callers may + skip this string because it cannot be encoded + in the output character set. */ + illegal_string = true; + continue; + } } - addc (&lrb, ch); if (return_widestr) - ADDWC ((uint32_t) ch); + ADDWC (wch); continue; }