From: Florian Weimer <fweimer@redhat.com>
To: libc-alpha@sourceware.org
Subject: [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c
Date: Thu, 19 May 2022 23:06:38 +0200 [thread overview]
Message-ID: <a89cee054d28d43cf8f7e5f171e876326e4af96e.1652994079.git.fweimer@redhat.com> (raw)
In-Reply-To: <cover.1652994079.git.fweimer@redhat.com>
This will permit reusing the Unicode character processing for
different character encodings, not just the current <U...> encoding.
---
locale/programs/linereader.c | 167 ++++++++++++++++++-----------------
1 file changed, 85 insertions(+), 82 deletions(-)
diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
index d5367e0a1e..f7292f0102 100644
--- a/locale/programs/linereader.c
+++ b/locale/programs/linereader.c
@@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
return &lr->token;
}
+/* Process a decoded Unicode codepoint WCH in a string, placing the
+ multibyte sequence into LRB. Return false if the character is not
+ found in CHARMAP/REPERTOIRE. */
+static bool
+translate_unicode_codepoint (struct localedef_t *locale,
+ const struct charmap_t *charmap,
+ const struct repertoire_t *repertoire,
+ uint32_t wch, struct lr_buffer *lrb)
+{
+ /* See whether the charmap contains the Uxxxxxxxx names. */
+ char utmp[10];
+ snprintf (utmp, sizeof (utmp), "U%08X", wch);
+ struct charseq *seq = charmap_find_value (charmap, utmp, 9);
+
+ if (seq == NULL)
+ {
+ /* No, this isn't the case. Now determine from
+ the repertoire the name of the character and
+ find it in the charmap. */
+ if (repertoire != NULL)
+ {
+ const char *symbol = repertoire_find_symbol (repertoire, wch);
+ if (symbol != NULL)
+ seq = charmap_find_value (charmap, symbol, strlen (symbol));
+ }
+
+ if (seq == NULL)
+ {
+#ifndef NO_TRANSLITERATION
+ /* Transliterate if possible. */
+ if (locale != NULL)
+ {
+ if ((locale->avail & CTYPE_LOCALE) == 0)
+ {
+ /* Load the CTYPE data now. */
+ int old_needed = locale->needed;
+
+ locale->needed = 0;
+ locale = load_locale (LC_CTYPE, locale->name,
+ locale->repertoire_name,
+ charmap, locale);
+ locale->needed = old_needed;
+ }
+
+ uint32_t *translit;
+ if ((locale->avail & CTYPE_LOCALE) != 0
+ && ((translit = find_translit (locale, charmap, wch))
+ != NULL))
+ /* The CTYPE data contains a matching
+ transliteration. */
+ {
+ for (int i = 0; translit[i] != 0; ++i)
+ {
+ snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
+ seq = charmap_find_value (charmap, utmp, 9);
+ assert (seq != NULL);
+ adds (lrb, seq->bytes, seq->nbytes);
+ }
+ return true;
+ }
+ }
+#endif /* NO_TRANSLITERATION */
+
+ /* Not a known name. */
+ return false;
+ }
+ }
+
+ if (seq != NULL)
+ {
+ adds (lrb, seq->bytes, seq->nbytes);
+ return true;
+ }
+ else
+ return false;
+}
+
static struct token *
get_string (struct linereader *lr, const struct charmap_t *charmap,
@@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
}
else
{
- int illegal_string = 0;
+ bool illegal_string = false;
size_t buf2act = 0;
size_t buf2max = 56 * sizeof (uint32_t);
int ch;
@@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
{
/* <> is no correct name. Ignore it and also signal an
error. */
- illegal_string = 1;
+ illegal_string = true;
continue;
}
@@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (cp == &lrb.buf[lrb.act])
{
- char utmp[10];
-
/* Yes, it is. */
addc (&lrb, '\0');
wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
@@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (return_widestr)
ADDWC (wch);
- /* See whether the charmap contains the Uxxxxxxxx names. */
- snprintf (utmp, sizeof (utmp), "U%08X", wch);
- seq = charmap_find_value (charmap, utmp, 9);
-
- if (seq == NULL)
- {
- /* No, this isn't the case. Now determine from
- the repertoire the name of the character and
- find it in the charmap. */
- if (repertoire != NULL)
- {
- const char *symbol;
-
- symbol = repertoire_find_symbol (repertoire, wch);
-
- if (symbol != NULL)
- seq = charmap_find_value (charmap, symbol,
- strlen (symbol));
- }
-
- if (seq == NULL)
- {
-#ifndef NO_TRANSLITERATION
- /* Transliterate if possible. */
- if (locale != NULL)
- {
- uint32_t *translit;
-
- if ((locale->avail & CTYPE_LOCALE) == 0)
- {
- /* Load the CTYPE data now. */
- int old_needed = locale->needed;
-
- locale->needed = 0;
- locale = load_locale (LC_CTYPE,
- locale->name,
- locale->repertoire_name,
- charmap, locale);
- locale->needed = old_needed;
- }
-
- if ((locale->avail & CTYPE_LOCALE) != 0
- && ((translit = find_translit (locale,
- charmap, wch))
- != NULL))
- /* The CTYPE data contains a matching
- transliteration. */
- {
- int i;
-
- for (i = 0; translit[i] != 0; ++i)
- {
- char utmp[10];
-
- snprintf (utmp, sizeof (utmp), "U%08X",
- translit[i]);
- seq = charmap_find_value (charmap, utmp,
- 9);
- assert (seq != NULL);
- adds (&lrb, seq->bytes, seq->nbytes);
- }
-
- continue;
- }
- }
-#endif /* NO_TRANSLITERATION */
-
- /* Not a known name. */
- illegal_string = 1;
- }
- }
-
- if (seq != NULL)
- adds (&lrb, seq->bytes, seq->nbytes);
-
+ if (!translate_unicode_codepoint (locale, charmap,
+ repertoire, wch, &lrb))
+ illegal_string = true;
continue;
}
}
@@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
/* This name is not in the charmap. */
lr_error (lr, _("symbol `%.*s' not in charmap"),
(int) (lrb.act - startidx), &lrb.buf[startidx]);
- illegal_string = 1;
+ illegal_string = true;
}
if (return_widestr)
@@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
/* This name is not in the repertoire map. */
lr_error (lr, _("symbol `%.*s' not in repertoire map"),
(int) (lrb.act - startidx), &lrb.buf[startidx]);
- illegal_string = 1;
+ illegal_string = true;
}
else
ADDWC (wch);
@@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
if (ch == '\n' || ch == EOF)
{
lr_error (lr, _("unterminated string"));
- illegal_string = 1;
+ illegal_string = true;
}
if (illegal_string)
--
2.35.3
next prev parent reply other threads:[~2022-05-19 21:06 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` Florian Weimer [this message]
2022-07-04 19:54 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Carlos O'Donell
2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-07-05 9:27 ` Andreas Schwab
2022-07-05 9:55 ` Florian Weimer
2022-07-05 10:38 ` Andreas Schwab
2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=a89cee054d28d43cf8f7e5f171e876326e4af96e.1652994079.git.fweimer@redhat.com \
--to=fweimer@redhat.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).