From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7905) id 0F3E93858285; Tue, 16 Jan 2024 17:52:51 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 0F3E93858285 DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1705427571; bh=hfVZXLkZ5tO7ljoaVTjcjhbtj1kxwIwE8TjOFbOfco4=; h=From:To:Subject:Date:From; b=SslIZqZERyI9a6uXbzQWv6oe+wOW4PeeULhm9cyAAPBqb6n5MtOgr28ufFqd0KnMN AJ/F5NffFfZX2ssLyR+4FBQ0WH/yXLCFwxxhkwe0+aEZe5M0oNy4zGo7Gl3Sn8JNnc iRDQstqM+rzc3IRcQ/E+EPEci+SeHTcgaQM/VH74= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Arthur Cohen To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-7676] gccrs: tokenize Unicode identifiers X-Act-Checkin: gcc X-Git-Author: Raiki Tamura X-Git-Refname: refs/heads/trunk X-Git-Oldrev: 45c01fae2a2afb30b0ca933e9d7b807acb167d23 X-Git-Newrev: 61644aea34c4623d16273ff705f8b8b1ff2d87f0 Message-Id: <20240116175251.0F3E93858285@sourceware.org> Date: Tue, 16 Jan 2024 17:52:51 +0000 (GMT) List-Id: https://gcc.gnu.org/g:61644aea34c4623d16273ff705f8b8b1ff2d87f0 commit r14-7676-g61644aea34c4623d16273ff705f8b8b1ff2d87f0 Author: Raiki Tamura Date: Mon Jun 19 18:06:11 2023 +0900 gccrs: tokenize Unicode identifiers gcc/rust/ChangeLog: * lex/rust-lex.cc (is_whitespace):add all missing codepoints valid as whitespace (is_identifier_start):new function to check XID_Start and underscore (is_identifier_continue):new function to check XID_Continue (Lexer::build_token):tokenize Unicode identifiers (Lexer::parse_partial_string_continue):add comments (Lexer::parse_partial_unicode_escape):add comments (Lexer::parse_raw_identifier):change to use `is_identifier_continue` (Lexer::parse_identifier_or_keyword):change to use `is_identifier_continue` (Lexer::parse_char_or_lifetime):change to use `is_identifier_start/continue` (Lexer::skip_codepoint_input):do not attempt to skip input when bumping EOF * lex/rust-lex.h:add 
`current_char32` field Signed-off-by: Raiki Tamura Diff: --- gcc/rust/lex/rust-lex.cc | 87 +++++++++++++++++++++++++++++++----------------- gcc/rust/lex/rust-lex.h | 6 ++-- 2 files changed, 61 insertions(+), 32 deletions(-) diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index 23579e51944..94ec67d2e66 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -22,6 +22,7 @@ #include "rust-linemap.h" #include "rust-session-manager.h" #include "safe-ctype.h" +#include "cpplib.h" namespace Rust { // TODO: move to separate compilation unit? @@ -103,11 +104,17 @@ check_valid_float_dot_end (char character) return character != '.' && character != '_' && !ISALPHA (character); } -// ISSPACE from safe-ctype but may change in future bool -is_whitespace (char character) +is_whitespace (int character) { - return ISSPACE (character); + // https://doc.rust-lang.org/reference/whitespace.html + return character == '\t' || character == '\n' || character == '\v' + || character == '\f' || character == '\r' || character == ' ' + || character == 0x0085 // next line + || character == 0x200e // left-to-right mark + || character == 0x200f // right-to-left mark + || character == 0x2028 // line separator + || character == 0x2029; // paragraph separator } bool @@ -116,6 +123,18 @@ is_non_decimal_int_literal_separator (char character) return character == 'x' || character == 'o' || character == 'b'; } +bool +is_identifier_start (int codepoint) +{ + return (cpp_check_xid_property (codepoint) & CPP_XID_START) || codepoint == '_'; +} + +bool +is_identifier_continue (int codepoint) +{ + return cpp_check_xid_property (codepoint) & CPP_XID_CONTINUE; +} + Lexer::Lexer (const std::string &input) : input (RAIIFile::create_error ()), current_line (1), current_column (1), line_map (nullptr), dump_lex_out (Optional::none ()), @@ -284,22 +303,22 @@ Lexer::build_token () while (true) { Location loc = get_current_location (); - current_char = peek_input (); - skip_input (); // 
detect UTF8 bom // // Must be the first thing on the first line. // There might be an optional BOM (Byte Order Mark), which for UTF-8 is // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. - if (current_line == 1 && current_column == 1 && current_char == 0xef - && peek_input () == 0xbb && peek_input (1) == 0xbf) + if (current_line == 1 && current_column == 1 && peek_input () == 0xef + && peek_input (1) == 0xbb && peek_input (2) == 0xbf) { - skip_input (1); - current_char = peek_input (); - skip_input (); + skip_input (2); } + current_char = peek_input (); + current_char32 = peek_codepoint_input (); + skip_codepoint_input (); + // detect shebang // Must be the first thing on the first line, starting with #! // But since an attribute can also start with an #! we don't count it as a @@ -312,6 +331,7 @@ Lexer::build_token () int n = 1; while (true) { + // TODO use utf-8 codepoint to skip whitespaces int next_char = peek_input (n); if (is_whitespace (next_char)) n++; @@ -1052,7 +1072,8 @@ Lexer::build_token () int peek = peek_input (); int peek1 = peek_input (1); - if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) + // TODO (tamaron) parse Unicode ident + if (peek == '#' && is_identifier_start (peek1)) { TokenPtr raw_ident_ptr = parse_raw_identifier (loc); if (raw_ident_ptr != nullptr) @@ -1069,8 +1090,8 @@ Lexer::build_token () } } - // find identifiers and keywords - if (ISALPHA (current_char) || current_char == '_') + // find identifiers and keywords. 
+ if (is_identifier_start (current_char32.value)) return parse_identifier_or_keyword (loc); // int and float literals @@ -1468,6 +1489,7 @@ Lexer::parse_partial_string_continue () int additional_length_offset = 1; // string continue + // TODO use utf-8 codepoint to skip whitespaces while (is_whitespace (current_char)) { if (current_char == '\n') @@ -1611,6 +1633,7 @@ Lexer::parse_partial_unicode_escape () // wrong bracketm whitespace or single/double quotes are wrong // termination, otherwise it is a wrong character, then skip to the actual // terminator. + // TODO use utf-8 codepoint to skip whitespaces if (current_char == '{' || is_whitespace (current_char) || current_char == '\'' || current_char == '"') { @@ -1623,6 +1646,7 @@ Lexer::parse_partial_unicode_escape () rust_error_at (get_current_location (), "invalid character %<%c%> in unicode escape", current_char); + // TODO use utf-8 codepoint to skip whitespaces while (current_char != '}' && current_char != '{' && !is_whitespace (current_char) && current_char != '\'' && current_char != '"') @@ -1905,8 +1929,7 @@ Lexer::parse_raw_identifier (Location loc) int length = 0; current_char = peek_input (); // loop through entire name - while (ISALPHA (current_char) || ISDIGIT (current_char) - || current_char == '_') + while (is_identifier_continue (current_char)) { length++; @@ -2042,21 +2065,22 @@ Lexer::parse_identifier_or_keyword (Location loc) { std::string str; str.reserve (16); // default - str += current_char; + str += current_char32.as_string (); bool first_is_underscore = current_char == '_'; int length = 1; - current_char = peek_input (); + current_char32 = peek_codepoint_input (); + // loop through entire name - while (ISALPHA (current_char) || ISDIGIT (current_char) - || current_char == '_') + while (is_identifier_continue (current_char32.value)) { + auto s = current_char32.as_string (); length++; - str += current_char; - skip_input (); - current_char = peek_input (); + str += current_char32.as_string (); 
+ skip_codepoint_input (); + current_char32 = peek_codepoint_input (); } current_column += length; @@ -2444,21 +2468,19 @@ Lexer::parse_char_or_lifetime (Location loc) return Token::make_char (loc, current_char32); } - else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value) - || current_char32.value == '_') + else if (is_identifier_start (current_char32.value)) { // parse lifetime name std::string str; str += current_char32; length++; - current_char = peek_input (); - while (ISDIGIT (current_char) || ISALPHA (current_char) - || current_char == '_') + current_char32 = peek_codepoint_input (); + while (is_identifier_continue (current_char32.value)) { - str += current_char; - skip_input (); - current_char = peek_input (); + str += current_char32; + skip_codepoint_input (); + current_char32 = peek_codepoint_input (); length++; } @@ -2466,6 +2488,9 @@ Lexer::parse_char_or_lifetime (Location loc) loc += length - 1; + // TODO some keywords cannot be used for a lifetime label #2306 + // https://doc.rust-lang.org/reference/tokens.html + str.shrink_to_fit (); return Token::make_lifetime (loc, std::move (str)); } @@ -2637,6 +2662,8 @@ Lexer::peek_codepoint_input () void Lexer::skip_codepoint_input () { + if (peek_input () == EOF) + return; int toSkip = get_input_codepoint_length (); gcc_assert (toSkip >= 1); diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h index e0b98e7bdd6..a05e8fcbfe1 100644 --- a/gcc/rust/lex/rust-lex.h +++ b/gcc/rust/lex/rust-lex.h @@ -118,9 +118,9 @@ private: // Advances current input char to n + 1 chars ahead of current position. void skip_input (int n); - // Returns char n chars ahead of current position. - int peek_input (); // Peeks the current char. + int peek_input (); + // Returns char n bytes ahead of current position. int peek_input (int n); // Classifies keyword (i.e. gets id for keyword). 
@@ -137,6 +137,7 @@ private: int get_input_codepoint_length (); int test_get_input_codepoint_n_length (int n_start_offset); + // Peeks the current utf-8 char Codepoint peek_codepoint_input (); Codepoint test_peek_codepoint_input (int n); void skip_codepoint_input (); @@ -220,6 +221,7 @@ private: int current_column; // Current character. int current_char; + Codepoint current_char32; // Line map. Linemap *line_map;