From: Carlos O'Donell <carlos@redhat.com>
To: Florian Weimer <fweimer@redhat.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c
Date: Mon, 4 Jul 2022 15:54:15 -0400 [thread overview]
Message-ID: <b7235300-0704-065f-689b-8ac074230522@redhat.com> (raw)
In-Reply-To: <a89cee054d28d43cf8f7e5f171e876326e4af96e.1652994079.git.fweimer@redhat.com>
On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> This will permit reusing the Unicode character processing for
> different character encodings, not just the current <U...> encoding.
LGTM. Straight forward refactor.
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>
> ---
> locale/programs/linereader.c | 167 ++++++++++++++++++-----------------
> 1 file changed, 85 insertions(+), 82 deletions(-)
>
> diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
> index d5367e0a1e..f7292f0102 100644
> --- a/locale/programs/linereader.c
> +++ b/locale/programs/linereader.c
> @@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
> return &lr->token;
> }
>
> +/* Process a decoded Unicode codepoint WCH in a string, placing the
> + multibyte sequence into LRB. Return false if the character is not
> + found in CHARMAP/REPERTOIRE. */
> +static bool
> +translate_unicode_codepoint (struct localedef_t *locale,
> + const struct charmap_t *charmap,
> + const struct repertoire_t *repertoire,
> + uint32_t wch, struct lr_buffer *lrb)
> +{
> + /* See whether the charmap contains the Uxxxxxxxx names. */
> + char utmp[10];
> + snprintf (utmp, sizeof (utmp), "U%08X", wch);
> + struct charseq *seq = charmap_find_value (charmap, utmp, 9);
> +
> + if (seq == NULL)
> + {
> + /* No, this isn't the case. Now determine from
> + the repertoire the name of the character and
> + find it in the charmap. */
> + if (repertoire != NULL)
> + {
> + const char *symbol = repertoire_find_symbol (repertoire, wch);
> + if (symbol != NULL)
> + seq = charmap_find_value (charmap, symbol, strlen (symbol));
> + }
> +
> + if (seq == NULL)
> + {
> +#ifndef NO_TRANSLITERATION
> + /* Transliterate if possible. */
> + if (locale != NULL)
> + {
> + if ((locale->avail & CTYPE_LOCALE) == 0)
> + {
> + /* Load the CTYPE data now. */
> + int old_needed = locale->needed;
> +
> + locale->needed = 0;
> + locale = load_locale (LC_CTYPE, locale->name,
> + locale->repertoire_name,
> + charmap, locale);
> + locale->needed = old_needed;
> + }
> +
> + uint32_t *translit;
> + if ((locale->avail & CTYPE_LOCALE) != 0
> + && ((translit = find_translit (locale, charmap, wch))
> + != NULL))
> + /* The CTYPE data contains a matching
> + transliteration. */
> + {
> + for (int i = 0; translit[i] != 0; ++i)
> + {
> + snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
> + seq = charmap_find_value (charmap, utmp, 9);
> + assert (seq != NULL);
> + adds (lrb, seq->bytes, seq->nbytes);
> + }
> + return true;
> + }
> + }
> +#endif /* NO_TRANSLITERATION */
> +
> + /* Not a known name. */
> + return false;
> + }
> + }
> +
> + if (seq != NULL)
> + {
> + adds (lrb, seq->bytes, seq->nbytes);
> + return true;
> + }
> + else
> + return false;
> +}
> +
>
> static struct token *
> get_string (struct linereader *lr, const struct charmap_t *charmap,
> @@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> }
> else
> {
> - int illegal_string = 0;
> + bool illegal_string = false;
> size_t buf2act = 0;
> size_t buf2max = 56 * sizeof (uint32_t);
> int ch;
> @@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> {
> /* <> is no correct name. Ignore it and also signal an
> error. */
> - illegal_string = 1;
> + illegal_string = true;
> continue;
> }
>
> @@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>
> if (cp == &lrb.buf[lrb.act])
> {
> - char utmp[10];
> -
> /* Yes, it is. */
> addc (&lrb, '\0');
> wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
> @@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> if (return_widestr)
> ADDWC (wch);
>
> - /* See whether the charmap contains the Uxxxxxxxx names. */
> - snprintf (utmp, sizeof (utmp), "U%08X", wch);
> - seq = charmap_find_value (charmap, utmp, 9);
> -
> - if (seq == NULL)
> - {
> - /* No, this isn't the case. Now determine from
> - the repertoire the name of the character and
> - find it in the charmap. */
> - if (repertoire != NULL)
> - {
> - const char *symbol;
> -
> - symbol = repertoire_find_symbol (repertoire, wch);
> -
> - if (symbol != NULL)
> - seq = charmap_find_value (charmap, symbol,
> - strlen (symbol));
> - }
> -
> - if (seq == NULL)
> - {
> -#ifndef NO_TRANSLITERATION
> - /* Transliterate if possible. */
> - if (locale != NULL)
> - {
> - uint32_t *translit;
> -
> - if ((locale->avail & CTYPE_LOCALE) == 0)
> - {
> - /* Load the CTYPE data now. */
> - int old_needed = locale->needed;
> -
> - locale->needed = 0;
> - locale = load_locale (LC_CTYPE,
> - locale->name,
> - locale->repertoire_name,
> - charmap, locale);
> - locale->needed = old_needed;
> - }
> -
> - if ((locale->avail & CTYPE_LOCALE) != 0
> - && ((translit = find_translit (locale,
> - charmap, wch))
> - != NULL))
> - /* The CTYPE data contains a matching
> - transliteration. */
> - {
> - int i;
> -
> - for (i = 0; translit[i] != 0; ++i)
> - {
> - char utmp[10];
> -
> - snprintf (utmp, sizeof (utmp), "U%08X",
> - translit[i]);
> - seq = charmap_find_value (charmap, utmp,
> - 9);
> - assert (seq != NULL);
> - adds (&lrb, seq->bytes, seq->nbytes);
> - }
> -
> - continue;
> - }
> - }
> -#endif /* NO_TRANSLITERATION */
> -
> - /* Not a known name. */
> - illegal_string = 1;
> - }
> - }
> -
> - if (seq != NULL)
> - adds (&lrb, seq->bytes, seq->nbytes);
> -
> + if (!translate_unicode_codepoint (locale, charmap,
> + repertoire, wch, &lrb))
> + illegal_string = true;
OK. Refactor.
> continue;
> }
> }
> @@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> /* This name is not in the charmap. */
> lr_error (lr, _("symbol `%.*s' not in charmap"),
> (int) (lrb.act - startidx), &lrb.buf[startidx]);
> - illegal_string = 1;
> + illegal_string = true;
> }
>
> if (return_widestr)
> @@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> /* This name is not in the repertoire map. */
> lr_error (lr, _("symbol `%.*s' not in repertoire map"),
> (int) (lrb.act - startidx), &lrb.buf[startidx]);
> - illegal_string = 1;
> + illegal_string = true;
> }
> else
> ADDWC (wch);
> @@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
> if (ch == '\n' || ch == EOF)
> {
> lr_error (lr, _("unterminated string"));
> - illegal_string = 1;
> + illegal_string = true;
> }
>
> if (illegal_string)
OK.
--
Cheers,
Carlos.
next prev parent reply other threads:[~2022-07-04 19:54 UTC|newest]
Thread overview: 15+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell [this message]
2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
2022-07-04 19:54 ` Carlos O'Donell
2022-07-05 9:27 ` Andreas Schwab
2022-07-05 9:55 ` Florian Weimer
2022-07-05 10:38 ` Andreas Schwab
2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=b7235300-0704-065f-689b-8ac074230522@redhat.com \
--to=carlos@redhat.com \
--cc=fweimer@redhat.com \
--cc=libc-alpha@sourceware.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).