public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-7676] gccrs: tokenize Unicode identifiers
@ 2024-01-16 17:52 Arthur Cohen
0 siblings, 0 replies; only message in thread
From: Arthur Cohen @ 2024-01-16 17:52 UTC (permalink / raw)
To: gcc-cvs
https://gcc.gnu.org/g:61644aea34c4623d16273ff705f8b8b1ff2d87f0
commit r14-7676-g61644aea34c4623d16273ff705f8b8b1ff2d87f0
Author: Raiki Tamura <tamaron1203@gmail.com>
Date: Mon Jun 19 18:06:11 2023 +0900
gccrs: tokenize Unicode identifiers
gcc/rust/ChangeLog:
* lex/rust-lex.cc (is_whitespace): add all missing codepoints that are valid whitespace
(is_identifier_start): new function to check XID_Start and underscore
(is_identifier_continue): new function to check XID_Continue
(Lexer::build_token): tokenize Unicode identifiers
(Lexer::parse_partial_string_continue): add comments
(Lexer::parse_partial_unicode_escape): add comments
(Lexer::parse_raw_identifier): change to use `is_identifier_continue`
(Lexer::parse_identifier_or_keyword): change to use `is_identifier_continue`
(Lexer::parse_char_or_lifetime): change to use `is_identifier_start/continue`
(Lexer::skip_codepoint_input): do not attempt to skip input when bumping EOF
* lex/rust-lex.h: add `current_char32` field
Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>
Diff:
---
gcc/rust/lex/rust-lex.cc | 87 +++++++++++++++++++++++++++++++-----------------
gcc/rust/lex/rust-lex.h | 6 ++--
2 files changed, 61 insertions(+), 32 deletions(-)
diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc
index 23579e51944..94ec67d2e66 100644
--- a/gcc/rust/lex/rust-lex.cc
+++ b/gcc/rust/lex/rust-lex.cc
@@ -22,6 +22,7 @@
#include "rust-linemap.h"
#include "rust-session-manager.h"
#include "safe-ctype.h"
+#include "cpplib.h"
namespace Rust {
// TODO: move to separate compilation unit?
@@ -103,11 +104,17 @@ check_valid_float_dot_end (char character)
return character != '.' && character != '_' && !ISALPHA (character);
}
-// ISSPACE from safe-ctype but may change in future
bool
-is_whitespace (char character)
+is_whitespace (int character)
{
- return ISSPACE (character);
+ // https://doc.rust-lang.org/reference/whitespace.html
+ return character == '\t' || character == '\n' || character == '\v'
+ || character == '\f' || character == '\r' || character == ' '
+ || character == 0x0085 // next line
+ || character == 0x200e // left-to-right mark
+ || character == 0x200f // right-to-left mark
+ || character == 0x2028 // line separator
+ || character == 0x2029; // paragraph separator
}
bool
@@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character)
return character == 'x' || character == 'o' || character == 'b';
}
+bool
+is_identifier_start (int codepoint)
+{
+ return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_';
+}
+
+bool
+is_identifier_continue (int codepoint)
+{
+ return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE;
+}
+
Lexer::Lexer (const std::string &input)
: input (RAIIFile::create_error ()), current_line (1), current_column (1),
line_map (nullptr), dump_lex_out (Optional<std::ofstream &>::none ()),
@@ -284,22 +303,22 @@ Lexer::build_token ()
while (true)
{
Location loc = get_current_location ();
- current_char = peek_input ();
- skip_input ();
// detect UTF8 bom
//
// Must be the first thing on the first line.
// There might be an optional BOM (Byte Order Mark), which for UTF-8 is
// the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped.
- if (current_line == 1 && current_column == 1 && current_char == 0xef
- && peek_input () == 0xbb && peek_input (1) == 0xbf)
+ if (current_line == 1 && current_column == 1 && peek_input () == 0xef
+ && peek_input (1) == 0xbb && peek_input (2) == 0xbf)
{
- skip_input (1);
- current_char = peek_input ();
- skip_input ();
+ skip_input (2);
}
+ current_char = peek_input ();
+ current_char32 = peek_codepoint_input ();
+ skip_codepoint_input ();
+
// detect shebang
// Must be the first thing on the first line, starting with #!
// But since an attribute can also start with an #! we don't count it as a
@@ -312,6 +331,7 @@ Lexer::build_token ()
int n = 1;
while (true)
{
+ // TODO use utf-8 codepoint to skip whitespaces
int next_char = peek_input (n);
if (is_whitespace (next_char))
n++;
@@ -1052,7 +1072,8 @@ Lexer::build_token ()
int peek = peek_input ();
int peek1 = peek_input (1);
- if (peek == '#' && (ISALPHA (peek1) || peek1 == '_'))
+ // TODO (tamaron) parse Unicode ident
+ if (peek == '#' && is_identifier_start (peek1))
{
TokenPtr raw_ident_ptr = parse_raw_identifier (loc);
if (raw_ident_ptr != nullptr)
@@ -1069,8 +1090,8 @@ Lexer::build_token ()
}
}
- // find identifiers and keywords
- if (ISALPHA (current_char) || current_char == '_')
+ // find identifiers and keywords.
+ if (is_identifier_start (current_char32.value))
return parse_identifier_or_keyword (loc);
// int and float literals
@@ -1468,6 +1489,7 @@ Lexer::parse_partial_string_continue ()
int additional_length_offset = 1;
// string continue
+ // TODO use utf-8 codepoint to skip whitespaces
while (is_whitespace (current_char))
{
if (current_char == '\n')
@@ -1611,6 +1633,7 @@ Lexer::parse_partial_unicode_escape ()
// wrong bracketm whitespace or single/double quotes are wrong
// termination, otherwise it is a wrong character, then skip to the actual
// terminator.
+ // TODO use utf-8 codepoint to skip whitespaces
if (current_char == '{' || is_whitespace (current_char)
|| current_char == '\'' || current_char == '"')
{
@@ -1623,6 +1646,7 @@ Lexer::parse_partial_unicode_escape ()
rust_error_at (get_current_location (),
"invalid character %<%c%> in unicode escape",
current_char);
+ // TODO use utf-8 codepoint to skip whitespaces
while (current_char != '}' && current_char != '{'
&& !is_whitespace (current_char) && current_char != '\''
&& current_char != '"')
@@ -1905,8 +1929,7 @@ Lexer::parse_raw_identifier (Location loc)
int length = 0;
current_char = peek_input ();
// loop through entire name
- while (ISALPHA (current_char) || ISDIGIT (current_char)
- || current_char == '_')
+ while (is_identifier_continue (current_char))
{
length++;
@@ -2042,21 +2065,22 @@ Lexer::parse_identifier_or_keyword (Location loc)
{
std::string str;
str.reserve (16); // default
- str += current_char;
+ str += current_char32.as_string ();
bool first_is_underscore = current_char == '_';
int length = 1;
- current_char = peek_input ();
+ current_char32 = peek_codepoint_input ();
+
// loop through entire name
- while (ISALPHA (current_char) || ISDIGIT (current_char)
- || current_char == '_')
+ while (is_identifier_continue (current_char32.value))
{
+ auto s = current_char32.as_string ();
length++;
- str += current_char;
- skip_input ();
- current_char = peek_input ();
+ str += current_char32.as_string ();
+ skip_codepoint_input ();
+ current_char32 = peek_codepoint_input ();
}
current_column += length;
@@ -2444,21 +2468,19 @@ Lexer::parse_char_or_lifetime (Location loc)
return Token::make_char (loc, current_char32);
}
- else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value)
- || current_char32.value == '_')
+ else if (is_identifier_start (current_char32.value))
{
// parse lifetime name
std::string str;
str += current_char32;
length++;
- current_char = peek_input ();
- while (ISDIGIT (current_char) || ISALPHA (current_char)
- || current_char == '_')
+ current_char32 = peek_codepoint_input ();
+ while (is_identifier_continue (current_char32.value))
{
- str += current_char;
- skip_input ();
- current_char = peek_input ();
+ str += current_char32;
+ skip_codepoint_input ();
+ current_char32 = peek_codepoint_input ();
length++;
}
@@ -2466,6 +2488,9 @@ Lexer::parse_char_or_lifetime (Location loc)
loc += length - 1;
+ // TODO some keywords cannot be used for a lifetime label #2306
+ // https://doc.rust-lang.org/reference/tokens.html
+
str.shrink_to_fit ();
return Token::make_lifetime (loc, std::move (str));
}
@@ -2637,6 +2662,8 @@ Lexer::peek_codepoint_input ()
void
Lexer::skip_codepoint_input ()
{
+ if (peek_input () == EOF)
+ return;
int toSkip = get_input_codepoint_length ();
gcc_assert (toSkip >= 1);
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index e0b98e7bdd6..a05e8fcbfe1 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -118,9 +118,9 @@ private:
// Advances current input char to n + 1 chars ahead of current position.
void skip_input (int n);
- // Returns char n chars ahead of current position.
- int peek_input ();
// Peeks the current char.
+ int peek_input ();
+ // Returns char n bytes ahead of current position.
int peek_input (int n);
// Classifies keyword (i.e. gets id for keyword).
@@ -137,6 +137,7 @@ private:
int get_input_codepoint_length ();
int test_get_input_codepoint_n_length (int n_start_offset);
+ // Peeks the current utf-8 char
Codepoint peek_codepoint_input ();
Codepoint test_peek_codepoint_input (int n);
void skip_codepoint_input ();
@@ -220,6 +221,7 @@ private:
int current_column;
// Current character.
int current_char;
+ Codepoint current_char32;
// Line map.
Linemap *line_map;
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2024-01-16 17:52 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-16 17:52 [gcc r14-7676] gccrs: tokenize Unicode identifiers Arthur Cohen
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).