public inbox for gcc-patches@gcc.gnu.org
 help / color / mirror / Atom feed
From: Marek Polacek <polacek@redhat.com>
To: David Malcolm <dmalcolm@redhat.com>
Cc: Joseph Myers <joseph@codesourcery.com>,
	Jakub Jelinek <jakub@redhat.com>,
	Martin Sebor <msebor@redhat.com>,
	GCC Patches <gcc-patches@gcc.gnu.org>
Subject: Re: [PATCH 2/2] libcpp: capture and underline ranges in -Wbidi-chars= [PR103026]
Date: Wed, 17 Nov 2021 18:01:37 -0500	[thread overview]
Message-ID: <YZWJ0Uxm241d+aBG@redhat.com> (raw)
In-Reply-To: <20211117224515.2484625-2-dmalcolm@redhat.com>

On Wed, Nov 17, 2021 at 05:45:15PM -0500, David Malcolm wrote:
> This patch converts the bidi::vec to use a struct so that we can
> capture location_t values for the bidirectional control characters.

Thanks for these improvements.  I noticed a few nits, but nothing that
needs to be fixed immediately.

> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -1172,11 +1172,34 @@ namespace bidi {
>    /* All the UTF-8 encodings of bidi characters start with E2.  */
>    constexpr uchar utf8_start = 0xe2;
>  
> +  struct context
> +  {
> +    context () {}
> +    context (location_t loc, kind k, bool pdf, bool ucn)
> +    : m_loc (loc), m_kind (k), m_pdf (pdf), m_ucn (ucn)
> +    {
> +    }
> +
> +    kind get_pop_kind () const
> +    {
> +      return m_pdf ? kind::PDF : kind::PDI;
> +    }
> +    bool ucn_p () const
> +    {
> +      return m_ucn;
> +    }
> +
> +    location_t m_loc;
> +    kind m_kind;
> +    unsigned m_pdf : 1;
> +    unsigned m_ucn : 1;

Should these members be private:, since we have getters for them?

> +  };
> +
>    /* A vector holding currently open bidi contexts.  We use a char for
>       each context, its LSB is 1 if it represents a PDF context, 0 if it
>       represents a PDI context.  The next bit is 1 if this context was open
>       by a bidi character written as a UCN, and 0 when it was UTF-8.  */

Looks like this comments needs to be updated now.

> -  semi_embedded_vec <unsigned char, 16> vec;
> +  semi_embedded_vec <context, 16> vec;
>  
>    /* Close the whole comment/identifier/string literal/character constant
>       context.  */
> @@ -1193,19 +1216,19 @@ namespace bidi {
>      vec.truncate (len - 1);
>    }
>  
> -  /* Return the context of the Ith element.  */
> -  kind ctx_at (unsigned int i)
> +  /* Return the pop kind of the context of the Ith element.  */
> +  kind pop_kind_at (unsigned int i)
>    {
> -    return (vec[i] & 1) ? kind::PDF : kind::PDI;
> +    return vec[i].get_pop_kind ();
>    }
>  
> -  /* Return which context is currently opened.  */
> +  /* Return the pop kind of the context that is currently opened.  */
>    kind current_ctx ()
>    {
>      unsigned int len = vec.count ();
>      if (len == 0)
>        return kind::NONE;
> -    return ctx_at (len - 1);
> +    return vec[len - 1].get_pop_kind ();
>    }
>  
>    /* Return true if the current context comes from a UCN origin, that is,
> @@ -1214,11 +1237,19 @@ namespace bidi {
>    {
>      unsigned int len = vec.count ();
>      gcc_checking_assert (len > 0);
> -    return (vec[len - 1] >> 1) & 1;
> +    return vec[len - 1].m_ucn;
>    }
>  
> -  /* We've read a bidi char, update the current vector as necessary.  */
> -  void on_char (kind k, bool ucn_p)
> +  location_t current_ctx_loc ()
> +  {
> +    unsigned int len = vec.count ();
> +    gcc_checking_assert (len > 0);
> +    return vec[len - 1].m_loc;
> +  }
> +
> +  /* We've read a bidi char, update the current vector as necessary.
> +     LOC is only valid when K is not kind::NONE.  */
> +  void on_char (kind k, bool ucn_p, location_t loc)
>    {
>      switch (k)
>        {
> @@ -1226,12 +1257,12 @@ namespace bidi {
>        case kind::RLE:
>        case kind::LRO:
>        case kind::RLO:
> -	vec.push (ucn_p ? 3u : 1u);
> +	vec.push (context (loc, k, true, ucn_p));
>  	break;
>        case kind::LRI:
>        case kind::RLI:
>        case kind::FSI:
> -	vec.push (ucn_p ? 2u : 0u);
> +	vec.push (context (loc, k, false, ucn_p));
>  	break;
>        /* PDF terminates the scope of the last LRE, RLE, LRO, or RLO
>  	 whose scope has not yet been terminated.  */
> @@ -1245,7 +1276,7 @@ namespace bidi {
>  	 yet been terminated.  */
>        case kind::PDI:
>  	for (int i = vec.count () - 1; i >= 0; --i)
> -	  if (ctx_at (i) == kind::PDI)
> +	  if (pop_kind_at (i) == kind::PDI)
>  	    {
>  	      vec.truncate (i);
>  	      break;
> @@ -1295,10 +1326,47 @@ namespace bidi {
>    }
>  }
>  
> +/* Get location_t for the range of bytes [START, START + NUM_BYTES)
> +   within the current line in FILE, with the caret at START.  */
> +
> +static location_t
> +get_location_for_byte_range_in_cur_line (cpp_reader *pfile,
> +					 const unsigned char *const start,
> +					 size_t num_bytes)
> +{
> +  gcc_checking_assert (num_bytes > 0);
> +
> +  /* CPP_BUF_COLUMN and linemap_position_for_column both refer
> +     to offsets in bytes, but CPP_BUF_COLUMN is 0-based,
> +     whereas linemap_position_for_column is 1-based.  */
> +
> +  /* Get 0-based offsets within the line.  */
> +  size_t start_offset = CPP_BUF_COLUMN (pfile->buffer, start);
> +  size_t end_offset = start_offset + num_bytes - 1;
> +
> +  /* Now convert to location_t, where "columns" are 1-based byte offsets.  */
> +  location_t start_loc = linemap_position_for_column (pfile->line_table,
> +						      start_offset + 1);
> +  location_t end_loc = linemap_position_for_column (pfile->line_table,
> +						     end_offset + 1);
> +
> +  if (start_loc == end_loc)
> +    return start_loc;
> +
> +  source_range src_range;
> +  src_range.m_start = start_loc;
> +  src_range.m_finish = end_loc;
> +  location_t combined_loc = COMBINE_LOCATION_DATA (pfile->line_table,
> +						   start_loc,
> +						   src_range,
> +						   NULL);
> +  return combined_loc;
> +}
> +
>  /* Parse a sequence of 3 bytes starting with P and return its bidi code.  */
>  
>  static bidi::kind
> -get_bidi_utf8 (const unsigned char *const p)
> +get_bidi_utf8_1 (const unsigned char *const p)
>  {
>    gcc_checking_assert (p[0] == bidi::utf8_start);
>  
> @@ -1340,10 +1408,25 @@ get_bidi_utf8 (const unsigned char *const p)
>    return bidi::kind::NONE;
>  }
>  
> +/* Parse a sequence of 3 bytes starting with P and return its bidi code.
> +   If the kind is not NONE, write the location to *OUT.*/

'.  */' at the end

> +
> +static bidi::kind
> +get_bidi_utf8 (cpp_reader *pfile, const unsigned char *const p, location_t *out)
> +{
> +  bidi::kind result = get_bidi_utf8_1 (p);
> +  if (result != bidi::kind::NONE)
> +    {
> +      /* We have a sequence of 3 bytes starting at P.  */
> +      *out = get_location_for_byte_range_in_cur_line (pfile, p, 3);
> +    }
> +  return result;
> +}
> +
>  /* Parse a UCN where P points just past \u or \U and return its bidi code.  */
>  
>  static bidi::kind
> -get_bidi_ucn (const unsigned char *p, bool is_U)
> +get_bidi_ucn_1 (const unsigned char *p, bool is_U)
>  {
>    /* 6.4.3 Universal Character Names
>        \u hex-quad
> @@ -1412,6 +1495,62 @@ get_bidi_ucn (const unsigned char *p, bool is_U)
>    return bidi::kind::NONE;
>  }
>  
> +/* Parse a UCN where P points just past \u or \U and return its bidi code.
> +   If the kind is not NONE, write the location to *OUT.*/
> +
> +static bidi::kind
> +get_bidi_ucn (cpp_reader *pfile,  const unsigned char *p, bool is_U,

Two spaces before 'const unsigned char'.


> +	      location_t *out)
> +{
> +  bidi::kind result = get_bidi_ucn_1 (p, is_U);
> +  if (result != bidi::kind::NONE)
> +    {
> +      const unsigned char *start = p - 2;
> +      size_t num_bytes = 2 + (is_U ? 8 : 4);
> +      *out = get_location_for_byte_range_in_cur_line (pfile, start, num_bytes);
> +    }
> +  return result;
> +}
> +
> +/* Subclass of rich_location for reporting on unpaired UTF-8
> +   bidirectional control character(s).
> +   Escape the source lines on output, and show all unclosed
> +   bidi context, labelling everything.  */
> +
> +class unpaired_bidi_rich_location : public rich_location
> +{
> + public:
> +  class custom_range_label : public range_label
> +  {
> +   public:
> +     label_text get_text (unsigned range_idx) const FINAL OVERRIDE
> +     {
> +       /* range 0 is the primary location; each subsequent range i + 1
> +	  is for bidi::vec[i].  */
> +       if (range_idx > 0)
> +	 {
> +	   const bidi::context &ctxt (bidi::vec[range_idx - 1]);
> +	   return label_text::borrow (bidi::to_str (ctxt.m_kind));
> +	 }
> +       else
> +	 return label_text::borrow (_("end of bidirectional context"));
> +     }
> +  };
> +
> +  unpaired_bidi_rich_location (cpp_reader *pfile, location_t loc)
> +  : rich_location (pfile->line_table, loc, &m_custom_label)
> +  {
> +    set_escape_on_output (true);
> +    for (unsigned i = 0; i < bidi::vec.count (); i++)
> +      add_range (bidi::vec[i].m_loc,
> +		 SHOW_RANGE_WITHOUT_CARET,
> +		 &m_custom_label);
> +  }
> +
> + private:
> +   custom_range_label m_custom_label;
> +};
> +
>  /* We're closing a bidi context, that is, we've encountered a newline,
>     are closing a C-style comment, or are at the end of a string literal,
>     character constant, or identifier.  Warn if this context was not
> @@ -1427,11 +1566,17 @@ maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
>        const location_t loc
>  	= linemap_position_for_column (pfile->line_table,
>  				       CPP_BUF_COLUMN (pfile->buffer, p));
> -      rich_location rich_loc (pfile->line_table, loc);
> -      rich_loc.set_escape_on_output (true);
> -      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
> -		      "unpaired UTF-8 bidirectional control character "
> -		      "detected");
> +      unpaired_bidi_rich_location rich_loc (pfile, loc);
> +      /* cpp_callbacks doesn't yet have a way to handle singular vs plural
> +	 forms of a diagnostic, so fake it for now.  */
> +      if (bidi::vec.count () > 1)
> +	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
> +			"unpaired UTF-8 bidirectional control characters "
> +			"detected");
> +      else
> +	cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
> +			"unpaired UTF-8 bidirectional control character "
> +			"detected");
>      }
>    /* We're done with this context.  */
>    bidi::on_close ();
> @@ -1439,12 +1584,13 @@ maybe_warn_bidi_on_close (cpp_reader *pfile, const uchar *p)
>  
>  /* We're at the beginning or in the middle of an identifier/comment/string
>     literal/character constant.  Warn if we've encountered a bidi character.
> -   KIND says which bidi character it was; P points to it in the character
> -   stream.  UCN_P is true iff this bidi character was written as a UCN.  */
> +   KIND says which bidi control character it was; UCN_P is true iff this bidi
> +   control character was written as a UCN.  LOC is the location of the
> +   character, but is only valid if KIND != bidi::kind::NONE.  */
>  
>  static void
> -maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
> -			 bool ucn_p)
> +maybe_warn_bidi_on_char (cpp_reader *pfile, bidi::kind kind,
> +			 bool ucn_p, location_t loc)
>  {
>    if (__builtin_expect (kind == bidi::kind::NONE, 1))
>      return;
> @@ -1453,9 +1599,6 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
>  
>    if (warn_bidi != bidirectional_none)
>      {
> -      const location_t loc
> -	= linemap_position_for_column (pfile->line_table,
> -				       CPP_BUF_COLUMN (pfile->buffer, p));
>        rich_location rich_loc (pfile->line_table, loc);
>        rich_loc.set_escape_on_output (true);
>  
> @@ -1467,9 +1610,12 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
>  	{
>  	  if (warn_bidi == bidirectional_unpaired
>  	      && bidi::current_ctx_ucn_p () != ucn_p)
> -	    cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
> -			    "UTF-8 vs UCN mismatch when closing "
> -			    "a context by \"%s\"", bidi::to_str (kind));
> +	    {
> +	      rich_loc.add_range (bidi::current_ctx_loc ());
> +	      cpp_warning_at (pfile, CPP_W_BIDIRECTIONAL, &rich_loc,
> +			      "UTF-8 vs UCN mismatch when closing "
> +			      "a context by \"%s\"", bidi::to_str (kind));
> +	    }
>  	}
>        else if (warn_bidi == bidirectional_any)
>  	{
> @@ -1484,7 +1630,7 @@ maybe_warn_bidi_on_char (cpp_reader *pfile, const uchar *p, bidi::kind kind,
>  	}
>      }
>    /* We're done with this context.  */
> -  bidi::on_char (kind, ucn_p);
> +  bidi::on_char (kind, ucn_p, loc);
>  }
>  
>  /* Skip a C-style block comment.  We find the end of the comment by
> @@ -1552,8 +1698,9 @@ _cpp_skip_block_comment (cpp_reader *pfile)
>  	 a bidirectional control character.  */
>        else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
>  	{
> -	  bidi::kind kind = get_bidi_utf8 (cur - 1);
> -	  maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/false);
> +	  location_t loc;
> +	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
> +	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
>  	}
>      }
>  
> @@ -1586,9 +1733,9 @@ skip_line_comment (cpp_reader *pfile)
>  	    {
>  	      if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0))
>  		{
> -		  bidi::kind kind = get_bidi_utf8 (buffer->cur);
> -		  maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
> -					   /*ucn_p=*/false);
> +		  location_t loc;
> +		  bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
> +		  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
>  		}
>  	      buffer->cur++;
>  	    }
> @@ -1737,9 +1884,9 @@ forms_identifier_p (cpp_reader *pfile, int first,
>  	  if (__builtin_expect (*buffer->cur == bidi::utf8_start, 0)
>  	      && warn_bidi_p)
>  	    {
> -	      bidi::kind kind = get_bidi_utf8 (buffer->cur);
> -	      maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
> -				       /*ucn_p=*/false);
> +	      location_t loc;
> +	      bidi::kind kind = get_bidi_utf8 (pfile, buffer->cur, &loc);
> +	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
>  	    }
>  	  if (_cpp_valid_utf8 (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
>  			       state, &s))
> @@ -1751,10 +1898,12 @@ forms_identifier_p (cpp_reader *pfile, int first,
>  	  buffer->cur += 2;
>  	  if (warn_bidi_p)
>  	    {
> -	      bidi::kind kind = get_bidi_ucn (buffer->cur,
> -					      buffer->cur[-1] == 'U');
> -	      maybe_warn_bidi_on_char (pfile, buffer->cur, kind,
> -				       /*ucn_p=*/true);
> +	      location_t loc;
> +	      bidi::kind kind = get_bidi_ucn (pfile,
> +					      buffer->cur,
> +					      buffer->cur[-1] == 'U',
> +					      &loc);
> +	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
>  	    }
>  	  if (_cpp_valid_ucn (pfile, &buffer->cur, buffer->rlimit, 1 + !first,
>  			      state, &s, NULL, NULL))
> @@ -2375,8 +2524,11 @@ lex_raw_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
>  	}
>        else if (__builtin_expect ((unsigned char) c == bidi::utf8_start, 0)
>  	       && warn_bidi_p)
> -	maybe_warn_bidi_on_char (pfile, pos - 1, get_bidi_utf8 (pos - 1),
> -				 /*ucn_p=*/false);
> +	{
> +	  location_t loc;
> +	  bidi::kind kind = get_bidi_utf8 (pfile, pos - 1, &loc);
> +	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
> +	}
>      }
>  
>    if (warn_bidi_p)
> @@ -2486,8 +2638,10 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
>  	{
>  	  if ((cur[0] == 'u' || cur[0] == 'U') && warn_bidi_p)
>  	    {
> -	      bidi::kind kind = get_bidi_ucn (cur + 1, cur[0] == 'U');
> -	      maybe_warn_bidi_on_char (pfile, cur, kind, /*ucn_p=*/true);
> +	      location_t loc;
> +	      bidi::kind kind = get_bidi_ucn (pfile, cur + 1, cur[0] == 'U',
> +					      &loc);
> +	      maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/true, loc);
>  	    }
>  	  cur++;
>  	}
> @@ -2515,8 +2669,9 @@ lex_string (cpp_reader *pfile, cpp_token *token, const uchar *base)
>  	saw_NUL = true;
>        else if (__builtin_expect (c == bidi::utf8_start, 0) && warn_bidi_p)
>  	{
> -	  bidi::kind kind = get_bidi_utf8 (cur - 1);
> -	  maybe_warn_bidi_on_char (pfile, cur - 1, kind, /*ucn_p=*/false);
> +	  location_t loc;
> +	  bidi::kind kind = get_bidi_utf8 (pfile, cur - 1, &loc);
> +	  maybe_warn_bidi_on_char (pfile, kind, /*ucn_p=*/false, loc);
>  	}
>      }
>  
> -- 
> 2.26.3
> 

Marek


  reply	other threads:[~2021-11-17 23:01 UTC|newest]

Thread overview: 27+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-11-01 16:36 [PATCH] libcpp: Implement -Wbidirectional for CVE-2021-42574 [PR103026] Marek Polacek
2021-11-01 22:10 ` Joseph Myers
2021-11-02 17:18   ` [PATCH v2] " Marek Polacek
2021-11-02 19:20     ` Martin Sebor
2021-11-02 19:52       ` Marek Polacek
2021-11-08 21:33         ` Marek Polacek
2021-11-15 17:28           ` [PATCH] libcpp: Implement -Wbidi-chars " Marek Polacek
2021-11-15 23:15             ` David Malcolm
2021-11-16 19:50               ` [PATCH v2] " Marek Polacek
2021-11-16 23:00                 ` David Malcolm
2021-11-17  0:37                   ` [PATCH v3] " Marek Polacek
2021-11-17  2:28                     ` David Malcolm
2021-11-17  3:05                       ` Marek Polacek
2021-11-17 22:45                         ` [committed] libcpp: escape non-ASCII source bytes in -Wbidi-chars= [PR103026] David Malcolm
2021-11-17 22:45                           ` [PATCH 2/2] libcpp: capture and underline ranges " David Malcolm
2021-11-17 23:01                             ` Marek Polacek [this message]
2021-11-30  8:38             ` [PATCH] libcpp: Implement -Wbidi-chars for CVE-2021-42574 [PR103026] Stephan Bergmann
2021-11-30 13:26               ` Marek Polacek
2021-11-30 15:00                 ` Stephan Bergmann
2021-11-30 15:27                   ` Marek Polacek
2022-01-14  9:23                     ` Stephan Bergmann
2022-01-14 13:28                       ` Marek Polacek
2022-01-14 14:52                         ` Stephan Bergmann
2021-11-02 20:57 ` [PATCH 0/2] Re: [PATCH] libcpp: Implement -Wbidirectional " David Malcolm
2021-11-02 20:58   ` [PATCH 1/2] Flag CPP_W_BIDIRECTIONAL so that source lines are escaped David Malcolm
2021-11-02 21:07     ` David Malcolm
2021-11-02 20:58   ` [PATCH 2/2] Capture locations of bidi chars and underline ranges David Malcolm

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=YZWJ0Uxm241d+aBG@redhat.com \
    --to=polacek@redhat.com \
    --cc=dmalcolm@redhat.com \
    --cc=gcc-patches@gcc.gnu.org \
    --cc=jakub@redhat.com \
    --cc=joseph@codesourcery.com \
    --cc=msebor@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).