Re: [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: Carlos O'Donell <carlos@redhat.com>
To: Florian Weimer <fweimer@redhat.com>, libc-alpha@sourceware.org
Subject: Re: [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c
Date: Mon, 4 Jul 2022 15:54:15 -0400	[thread overview]
Message-ID: <b7235300-0704-065f-689b-8ac074230522@redhat.com> (raw)
In-Reply-To: <a89cee054d28d43cf8f7e5f171e876326e4af96e.1652994079.git.fweimer@redhat.com>

On 5/19/22 17:06, Florian Weimer via Libc-alpha wrote:
> This will permit reusing the Unicode character processing for
> different character encodings, not just the current <U...> encoding.

LGTM. Straight forward refactor.

Reviewed-by: Carlos O'Donell <carlos@redhat.com>
Tested-by: Carlos O'Donell <carlos@redhat.com>

> ---
>  locale/programs/linereader.c | 167 ++++++++++++++++++-----------------
>  1 file changed, 85 insertions(+), 82 deletions(-)
> 
> diff --git a/locale/programs/linereader.c b/locale/programs/linereader.c
> index d5367e0a1e..f7292f0102 100644
> --- a/locale/programs/linereader.c
> +++ b/locale/programs/linereader.c
> @@ -596,6 +596,83 @@ get_ident (struct linereader *lr)
>    return &lr->token;
>  }
>  
> +/* Process a decoded Unicode codepoint WCH in a string, placing the
> +   multibyte sequence into LRB.  Return false if the character is not
> +   found in CHARMAP/REPERTOIRE.  */
> +static bool
> +translate_unicode_codepoint (struct localedef_t *locale,
> +			     const struct charmap_t *charmap,
> +			     const struct repertoire_t *repertoire,
> +			     uint32_t wch, struct lr_buffer *lrb)
> +{
> +  /* See whether the charmap contains the Uxxxxxxxx names.  */
> +  char utmp[10];
> +  snprintf (utmp, sizeof (utmp), "U%08X", wch);
> +  struct charseq *seq = charmap_find_value (charmap, utmp, 9);
> +
> +  if (seq == NULL)
> +    {
> +      /* No, this isn't the case.  Now determine from
> +	 the repertoire the name of the character and
> +	 find it in the charmap.  */
> +      if (repertoire != NULL)
> +	{
> +	  const char *symbol = repertoire_find_symbol (repertoire, wch);
> +	  if (symbol != NULL)
> +	    seq = charmap_find_value (charmap, symbol, strlen (symbol));
> +	}
> +
> +      if (seq == NULL)
> +	{
> +#ifndef NO_TRANSLITERATION
> +	  /* Transliterate if possible.  */
> +	  if (locale != NULL)
> +	    {
> +	      if ((locale->avail & CTYPE_LOCALE) == 0)
> +		{
> +		  /* Load the CTYPE data now.  */
> +		  int old_needed = locale->needed;
> +
> +		  locale->needed = 0;
> +		  locale = load_locale (LC_CTYPE, locale->name,
> +					locale->repertoire_name,
> +					charmap, locale);
> +		  locale->needed = old_needed;
> +		}
> +
> +	      uint32_t *translit;
> +	      if ((locale->avail & CTYPE_LOCALE) != 0
> +		  && ((translit = find_translit (locale, charmap, wch))
> +		      != NULL))
> +		/* The CTYPE data contains a matching
> +		   transliteration.  */
> +		{
> +		  for (int i = 0; translit[i] != 0; ++i)
> +		    {
> +		      snprintf (utmp, sizeof (utmp), "U%08X", translit[i]);
> +		      seq = charmap_find_value (charmap, utmp, 9);
> +		      assert (seq != NULL);
> +		      adds (lrb, seq->bytes, seq->nbytes);
> +		    }
> +		  return true;
> +		}
> +	    }
> +#endif	/* NO_TRANSLITERATION */
> +
> +	  /* Not a known name.  */
> +	  return false;
> +	}
> +    }
> +
> +  if (seq != NULL)
> +    {
> +      adds (lrb, seq->bytes, seq->nbytes);
> +      return true;
> +    }
> +  else
> +    return false;
> +}
> +
>  
>  static struct token *
>  get_string (struct linereader *lr, const struct charmap_t *charmap,
> @@ -635,7 +712,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>      }
>    else
>      {
> -      int illegal_string = 0;
> +      bool illegal_string = false;
>        size_t buf2act = 0;
>        size_t buf2max = 56 * sizeof (uint32_t);
>        int ch;
> @@ -695,7 +772,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	    {
>  	      /* <> is no correct name.  Ignore it and also signal an
>  		 error.  */
> -	      illegal_string = 1;
> +	      illegal_string = true;
>  	      continue;
>  	    }
>  
> @@ -709,8 +786,6 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  
>  	      if (cp == &lrb.buf[lrb.act])
>  		{
> -		  char utmp[10];
> -
>  		  /* Yes, it is.  */
>  		  addc (&lrb, '\0');
>  		  wch = strtoul (lrb.buf + startidx + 1, NULL, 16);
> @@ -721,81 +796,9 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		  if (return_widestr)
>  		    ADDWC (wch);
>  
> -		  /* See whether the charmap contains the Uxxxxxxxx names.  */
> -		  snprintf (utmp, sizeof (utmp), "U%08X", wch);
> -		  seq = charmap_find_value (charmap, utmp, 9);
> -
> -		  if (seq == NULL)
> -		    {
> -		     /* No, this isn't the case.  Now determine from
> -			the repertoire the name of the character and
> -			find it in the charmap.  */
> -		      if (repertoire != NULL)
> -			{
> -			  const char *symbol;
> -
> -			  symbol = repertoire_find_symbol (repertoire, wch);
> -
> -			  if (symbol != NULL)
> -			    seq = charmap_find_value (charmap, symbol,
> -						      strlen (symbol));
> -			}
> -
> -		      if (seq == NULL)
> -			{
> -#ifndef NO_TRANSLITERATION
> -			  /* Transliterate if possible.  */
> -			  if (locale != NULL)
> -			    {
> -			      uint32_t *translit;
> -
> -			      if ((locale->avail & CTYPE_LOCALE) == 0)
> -				{
> -				  /* Load the CTYPE data now.  */
> -				  int old_needed = locale->needed;
> -
> -				  locale->needed = 0;
> -				  locale = load_locale (LC_CTYPE,
> -							locale->name,
> -							locale->repertoire_name,
> -							charmap, locale);
> -				  locale->needed = old_needed;
> -				}
> -
> -			      if ((locale->avail & CTYPE_LOCALE) != 0
> -				  && ((translit = find_translit (locale,
> -								 charmap, wch))
> -				      != NULL))
> -				/* The CTYPE data contains a matching
> -				   transliteration.  */
> -				{
> -				  int i;
> -
> -				  for (i = 0; translit[i] != 0; ++i)
> -				    {
> -				      char utmp[10];
> -
> -				      snprintf (utmp, sizeof (utmp), "U%08X",
> -						translit[i]);
> -				      seq = charmap_find_value (charmap, utmp,
> -								9);
> -				      assert (seq != NULL);
> -				      adds (&lrb, seq->bytes, seq->nbytes);
> -				    }
> -
> -				  continue;
> -				}
> -			    }
> -#endif	/* NO_TRANSLITERATION */
> -
> -			  /* Not a known name.  */
> -			  illegal_string = 1;
> -			}
> -		    }
> -
> -		  if (seq != NULL)
> -		    adds (&lrb, seq->bytes, seq->nbytes);
> -
> +		  if (!translate_unicode_codepoint (locale, charmap,
> +						    repertoire, wch, &lrb))
> +		    illegal_string = true;

OK. Refactor.

>  		  continue;
>  		}
>  	    }
> @@ -812,7 +815,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  	      /* This name is not in the charmap.  */
>  	      lr_error (lr, _("symbol `%.*s' not in charmap"),
>  			(int) (lrb.act - startidx), &lrb.buf[startidx]);
> -	      illegal_string = 1;
> +	      illegal_string = true;
>  	    }
>  
>  	  if (return_widestr)
> @@ -833,7 +836,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>  		  /* This name is not in the repertoire map.  */
>  		  lr_error (lr, _("symbol `%.*s' not in repertoire map"),
>  			    (int) (lrb.act - startidx), &lrb.buf[startidx]);
> -		  illegal_string = 1;
> +		  illegal_string = true;
>  		}
>  	      else
>  		ADDWC (wch);
> @@ -850,7 +853,7 @@ get_string (struct linereader *lr, const struct charmap_t *charmap,
>        if (ch == '\n' || ch == EOF)
>  	{
>  	  lr_error (lr, _("unterminated string"));
> -	  illegal_string = 1;
> +	  illegal_string = true;
>  	}
>  
>        if (illegal_string)

OK.

-- 
Cheers,
Carlos.

next prev parent reply	other threads:[~2022-07-04 19:54 UTC|newest]

Thread overview: 15+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-05-19 21:06 [PATCH 0/5] Assume UTF-8 encoding for localedef input files Florian Weimer
2022-05-19 21:06 ` [PATCH 1/5] locale: Turn ADDC and ADDS into functions in linereader.c Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 2/5] locale: Fix signed char bug in lr_getc Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 3/5] locale: Introduce translate_unicode_codepoint into linereader.c Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell [this message]
2022-05-19 21:06 ` [PATCH 4/5] locale: localdef input files are now encoded in UTF-8 Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-05-19 21:06 ` [PATCH 5/5] de_DE: Convert to UTF-8 Florian Weimer
2022-07-04 19:54   ` Carlos O'Donell
2022-07-05  9:27   ` Andreas Schwab
2022-07-05  9:55     ` Florian Weimer
2022-07-05 10:38       ` Andreas Schwab
2022-07-04 19:54 ` [PATCH 0/5] Assume UTF-8 encoding for localedef input files Carlos O'Donell

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=b7235300-0704-065f-689b-8ac074230522@redhat.com \
    --to=carlos@redhat.com \
    --cc=fweimer@redhat.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).