From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-ej1-x62f.google.com (mail-ej1-x62f.google.com [IPv6:2a00:1450:4864:20::62f]) by sourceware.org (Postfix) with ESMTPS id 13E753857409; Wed, 14 Sep 2022 13:30:54 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 13E753857409 Authentication-Results: sourceware.org; dmarc=pass (p=none dis=none) header.from=gmail.com Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=gmail.com Received: by mail-ej1-x62f.google.com with SMTP id sb3so5084445ejb.9; Wed, 14 Sep 2022 06:30:54 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:from:to:cc:subject:date; bh=//XRM8cAZdizf2pzJ37kymktBlSheAHP/Rjta0tdINg=; b=Vy1bRaFtN0NOMZS5eF1D6ucjHsyZ5oYI0rqrpbuIJWK3feAjgZXw0FiBwye2ru//W2 EHRtF7tBi9vq+K1mRrFu/Ke0O9/+zMPNxgmyzo++sAa9Iy5ZeR9IJhH1FrXyvqSb9cXK qZ1YsbdT9Eca6qNW0qbmHyzEepED9i46+HIpUQKpjMd1cbl/fSfgl50piwpEkeBCqmRS NPK6+utAHAAMrTaOD6b4e1DpEuX7cWYjyabMm770urZ9zM8p7/iFUolvgyZwGZ6yu04X U2UBA84ziyxgZ2HiRsls7VdqkfSliRbIYv5FLUHFvkfKre+ZBXw3vubehzRbUY6teIMZ w8ag== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=cc:to:subject:message-id:date:from:in-reply-to:references :mime-version:x-gm-message-state:from:to:cc:subject:date; bh=//XRM8cAZdizf2pzJ37kymktBlSheAHP/Rjta0tdINg=; b=WqAwObDXQkjQtHER6I7Uo/Eprkg5kOieZYw2NB5Q03PqGlNKQLb9GkFz1VzIRwiNVd sTqG78QCTbxreWnCOK2mkE64hcxd1x2WUIycJxrOtlUG9cGgOY4GNhjYXTYa9UPej7AX 8dhKW8c7GYUxadoinvmMBWvgrxbTze00TincST1XgoNvkMspxIoBD7jujSsPLPMvVanq vzQ2iPUw5C3RL7sPjOb/FPg09oMGgBwjjiP81sZW51x3AUG9sKtadty/KbwwG/RgSt7D cb8mpXIMHOeTH9VbpqbP05uPMQ+G0kGVy+a3dLvTFvLlmM2aerO49XDpP0uWh8lQK4Vq HEzA== X-Gm-Message-State: ACgBeo3PcSz438Wpql/bOE3zBdlO5XUfAvwrmfVwJms3Ao9M5A273nRz tHipFurlWvE4zpgbUB+fyf5t/XzaAwmlLwRZms8= X-Google-Smtp-Source: AA6agR5BZTPBU3xJFXxhe42w4V7WQBQkMGI3Z4F/G+UiXvU7Ui/olpXUZDOhNXeXbuht0wmwqk/zO76B8OTcsEParMQ= 
X-Received: by 2002:a17:907:971e:b0:77b:b0f3:6473 with SMTP id jg30-20020a170907971e00b0077bb0f36473mr13796584ejc.754.1663162251844; Wed, 14 Sep 2022 06:30:51 -0700 (PDT) MIME-Version: 1.0 References: <20220824115956.737931-1-philip.herron@embecosm.com> <20220824115956.737931-10-philip.herron@embecosm.com> In-Reply-To: <20220824115956.737931-10-philip.herron@embecosm.com> From: Richard Biener Date: Wed, 14 Sep 2022 15:30:39 +0200 Message-ID: Subject: Re: [PATCH Rust front-end v2 09/37] gccrs: Add Lexer for Rust front-end To: philip.herron@embecosm.com Cc: gcc-patches@gcc.gnu.org, The Other , Arthur Cohen , Mark Wielaard , gcc-rust@gcc.gnu.org Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-8.2 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,KAM_SHORT,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP,T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Wed, Aug 24, 2022 at 2:04 PM wrote: > > From: The Other > > The lexer is referred to as a ManagedTokenSource within the parser; this > lexer does not currently support Unicode but serves as a starting point > to do so. 
> > Co-authored-by: Philip Herron > Co-authored-by: Arthur Cohen > Co-authored-by: Mark Wielaard > --- > gcc/rust/lex/rust-codepoint.h | 46 + > gcc/rust/lex/rust-lex.cc | 2729 ++++++++++++++++++++++++++++++++ > gcc/rust/lex/rust-lex.h | 271 ++++ > gcc/rust/lex/rust-token.cc | 135 ++ > gcc/rust/lex/rust-token.h | 455 ++++++ > gcc/rust/rust-buffered-queue.h | 204 +++ > 6 files changed, 3840 insertions(+) > create mode 100644 gcc/rust/lex/rust-codepoint.h > create mode 100644 gcc/rust/lex/rust-lex.cc > create mode 100644 gcc/rust/lex/rust-lex.h > create mode 100644 gcc/rust/lex/rust-token.cc > create mode 100644 gcc/rust/lex/rust-token.h > create mode 100644 gcc/rust/rust-buffered-queue.h > > diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h > new file mode 100644 > index 00000000000..22da080bbb2 > --- /dev/null > +++ b/gcc/rust/lex/rust-codepoint.h > @@ -0,0 +1,46 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. > + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// . > + > +#ifndef RUST_CODEPOINT_H > +#define RUST_CODEPOINT_H > + > +#include > + > +namespace Rust { > +struct Codepoint > +{ > + uint32_t value; > + > + // Creates a zero codepoint. > + Codepoint () : value (0) {} > + > + // Creates a codepoint from an encoded UTF-8 value. 
> + Codepoint (uint32_t value) : value (value) {} > + > + static Codepoint eof () { return Codepoint (UINT32_MAX); } > + bool is_eof () const { return value == UINT32_MAX; } > + > + // Returns a C++ string containing string value of codepoint. > + std::string as_string (); > + > + bool operator== (Codepoint other) const { return value == other.value; } > + bool operator!= (Codepoint other) const { return !operator== (other); } > +}; > +} // namespace Rust > + > +#endif > diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc > new file mode 100644 > index 00000000000..70e6b50209f > --- /dev/null > +++ b/gcc/rust/lex/rust-lex.cc > @@ -0,0 +1,2729 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. > + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// . > + > +#include "rust-lex.h" > + > +#include "rust-system.h" // for rust_assert and rust_unreachable > +#include "rust-diagnostics.h" // for rust_error_at > +#include "rust-linemap.h" > +#include "rust-session-manager.h" > +#include "safe-ctype.h" just diving into a random patch here - I'm assuming I can take rust-lex.cc as a boiler-plate example for the #include structure. 
In GCC all files should start with #including "config.h" followed by "system.h", where _all_ system headers, including C++ standard library headers, should be pulled in via system.h to allow working around OS and system compiler issues. It might be that rust-system.h plays the role of config.h + system.h, but then the rust-lex.h include is before it. rust-codepoint.h including <string> is also problematic btw. Richard. > +namespace Rust { > +// TODO: move to separate compilation unit? > +// overload += for uint32_t to allow 32-bit encoded utf-8 to be added > +std::string & > +operator+= (std::string &str, Codepoint char32) > +{ > + if (char32.value < 0x80) > + { > + str += static_cast (char32.value); > + } > + else if (char32.value < (0x1F + 1) << (1 * 6)) > + { > + str += static_cast (0xC0 | ((char32.value >> 6) & 0x1F)); > + str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); > + } > + else if (char32.value < (0x0F + 1) << (2 * 6)) > + { > + str += static_cast (0xE0 | ((char32.value >> 12) & 0x0F)); > + str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); > + str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); > + } > + else if (char32.value < (0x07 + 1) << (3 * 6)) > + { > + str += static_cast (0xF0 | ((char32.value >> 18) & 0x07)); > + str += static_cast (0x80 | ((char32.value >> 12) & 0x3F)); > + str += static_cast (0x80 | ((char32.value >> 6) & 0x3F)); > + str += static_cast (0x80 | ((char32.value >> 0) & 0x3F)); > + } > + else > + { > + rust_debug ("Invalid unicode codepoint found: '%u' ", char32.value); > + } > + return str; > +} > + > +std::string > +Codepoint::as_string () > +{ > + std::string str; > + > + // str += Codepoint (value); > + str += *this; > + > + return str; > +} > + > +/* Includes all allowable float digits EXCEPT _ and . as that needs lookahead > + * for handling. 
*/ > +bool > +is_float_digit (char number) > +{ > + return ISDIGIT (number) || number == 'E' || number == 'e'; > +} > + > +/* Basically ISXDIGIT from safe-ctype but may change if Rust's encoding or > + * whatever is different */ > +bool > +is_x_digit (char number) > +{ > + return ISXDIGIT (number); > +} > + > +bool > +is_octal_digit (char number) > +{ > + return number >= '0' && number <= '7'; > +} > + > +bool > +is_bin_digit (char number) > +{ > + return number == '0' || number == '1'; > +} > + > +bool > +check_valid_float_dot_end (char character) > +{ > + return character != '.' && character != '_' && !ISALPHA (character); > +} > + > +// ISSPACE from safe-ctype but may change in future > +bool > +is_whitespace (char character) > +{ > + return ISSPACE (character); > +} > + > +bool > +is_non_decimal_int_literal_separator (char character) > +{ > + return character == 'x' || character == 'o' || character == 'b'; > +} > + > +Lexer::Lexer (const std::string &input) > + : input (RAIIFile::create_error ()), current_line (1), current_column (1), > + line_map (nullptr), raw_input_source (new BufferInputSource (input, 0)), > + input_queue{*raw_input_source}, token_queue (TokenSource (this)) > +{} > + > +Lexer::Lexer (const char *filename, RAIIFile file_input, Linemap *linemap) > + : input (std::move (file_input)), current_line (1), current_column (1), > + line_map (linemap), > + raw_input_source (new FileInputSource (input.get_raw ())), > + input_queue{*raw_input_source}, token_queue (TokenSource (this)) > +{ > + // inform line_table that file is being entered and is in line 1 > + if (linemap) > + line_map->start_file (filename, current_line); > +} > + > +Lexer::~Lexer () > +{ > + /* ok apparently stop (which is equivalent of original code in destructor) is > + * meant to be called after all files have finished parsing, for cleanup. 
On > + * the other hand, actual code that it calls to leave a certain line map is > + * mentioned in GCC docs as being useful for "just leaving an included header" > + * and stuff like that, so this line mapping functionality may need fixing. > + * FIXME: find out whether this occurs. */ > + > + // line_map->stop(); > +} > + > +/* TODO: need to optimise somehow to avoid the virtual function call in the > + * tight loop. Best idea at the moment is CRTP, but that might make lexer > + * implementation annoying when storing the "base class" (i.e. would need > + * template parameter everywhere), although in practice it would mostly just > + * look ugly and make enclosing classes like Parser also require a type > + * parameter. At this point a macro might be better. OK I guess macros can be > + * replaced by constexpr if or something if possible. */ > +Location > +Lexer::get_current_location () > +{ > + if (line_map) > + return line_map->get_location (current_column); > + else > + // If we have no linemap, we're lexing something without proper locations > + return Location (); > +} > + > +int > +Lexer::peek_input (int n) > +{ > + return input_queue.peek (n); > +} > + > +int > +Lexer::peek_input () > +{ > + return peek_input (0); > +} > + > +void > +Lexer::skip_input (int n) > +{ > + input_queue.skip (n); > +} > + > +void > +Lexer::skip_input () > +{ > + skip_input (0); > +} > + > +void > +Lexer::replace_current_token (TokenPtr replacement) > +{ > + token_queue.replace_current_value (replacement); > + > + rust_debug ("called 'replace_current_token' - this is deprecated"); > +} > + > +/* shitty anonymous namespace that can only be accessed inside the compilation > + * unit - used for classify_keyword binary search in sorted array of keywords > + * created with x-macros. 
*/ > +namespace { > +// TODO: make constexpr when update to c++20 > +const std::string keyword_index[] = { > +#define RS_TOKEN(x, y) > +#define RS_TOKEN_KEYWORD(name, keyword) keyword, > + RS_TOKEN_LIST > +#undef RS_TOKEN_KEYWORD > +#undef RS_TOKEN > +}; > + > +constexpr TokenId keyword_keys[] = { > +#define RS_TOKEN(x, y) > +#define RS_TOKEN_KEYWORD(name, keyword) name, > + RS_TOKEN_LIST > +#undef RS_TOKEN_KEYWORD > +#undef RS_TOKEN > +}; > + > +constexpr int num_keywords = sizeof (keyword_index) / sizeof (*keyword_index); > +} // namespace > + > +/* Determines whether the string passed in is a keyword or not. If it is, it > + * returns the keyword name. */ > +TokenId > +Lexer::classify_keyword (const std::string &str) > +{ > + const std::string *last = keyword_index + num_keywords; > + const std::string *idx = std::lower_bound (keyword_index, last, str); > + > + if (idx == last || str != *idx) > + return IDENTIFIER; > + > + // TODO: possibly replace this x-macro system with something like hash map? > + > + // We now have the expected token ID of the reserved keyword. However, some > + // keywords are reserved starting in certain editions. For example, `try` is > + // only a reserved keyword in editions >=2018. The language might gain new > + // reserved keywords in the future. > + // > + // https://doc.rust-lang.org/reference/keywords.html#reserved-keywords > + auto id = keyword_keys[idx - keyword_index]; > + > + // `try` is not a reserved keyword before 2018 > + if (Session::get_instance ().options.get_edition () > + == CompileOptions::Edition::E2015 > + && id == TRY) > + return IDENTIFIER; > + > + return id; > +} > + > +TokenPtr > +Lexer::build_token () > +{ > + // loop to go through multiple characters to build a single token > + while (true) > + { > + Location loc = get_current_location (); > + current_char = peek_input (); > + skip_input (); > + > + // detect UTF8 bom > + // > + // Must be the first thing on the first line. 
> + // There might be an optional BOM (Byte Order Mark), which for UTF-8 is > + // the three bytes 0xEF, 0xBB and 0xBF. These can simply be skipped. > + if (current_line == 1 && current_column == 1 && current_char == 0xef > + && peek_input () == 0xbb && peek_input (1) == 0xbf) > + { > + skip_input (1); > + current_char = peek_input (); > + skip_input (); > + } > + > + // detect shebang > + // Must be the first thing on the first line, starting with #! > + // But since an attribute can also start with an #! we don't count it as a > + // shebang line when after any whitespace or comments there is a [. If it > + // is a shebang line we simple drop the line. Otherwise we don't consume > + // any characters and fall through to the real tokenizer. > + if (current_line == 1 && current_column == 1 && current_char == '#' > + && peek_input () == '!') > + { > + int n = 1; > + while (true) > + { > + int next_char = peek_input (n); > + if (is_whitespace (next_char)) > + n++; > + else if ((next_char == '/' && peek_input (n + 1) == '/' > + && peek_input (n + 2) != '!' 
> + && peek_input (n + 2) != '/') > + || (next_char == '/' && peek_input (n + 1) == '/' > + && peek_input (n + 2) == '/' > + && peek_input (n + 3) == '/')) > + { > + // two // or four //// > + // A single line comment > + // (but not an inner or outer doc comment) > + n += 2; > + next_char = peek_input (n); > + while (next_char != '\n' && next_char != EOF) > + { > + n++; > + next_char = peek_input (n); > + } > + if (next_char == '\n') > + n++; > + } > + else if (next_char == '/' && peek_input (n + 1) == '*' > + && peek_input (n + 2) == '*' > + && peek_input (n + 3) == '/') > + { > + /**/ > + n += 4; > + } > + else if (next_char == '/' && peek_input (n + 1) == '*' > + && peek_input (n + 2) == '*' && peek_input (n + 3) == '*' > + && peek_input (n + 4) == '/') > + { > + /***/ > + n += 5; > + } > + else if ((next_char == '/' && peek_input (n + 1) == '*' > + && peek_input (n + 2) != '*' > + && peek_input (n + 2) != '!') > + || (next_char == '/' && peek_input (n + 1) == '*' > + && peek_input (n + 2) == '*' > + && peek_input (n + 3) == '*')) > + { > + // one /* or three /*** > + // Start of a block comment > + // (but not an inner or outer doc comment) > + n += 2; > + int level = 1; > + while (level > 0) > + { > + if (peek_input (n) == EOF) > + break; > + else if (peek_input (n) == '/' > + && peek_input (n + 1) == '*') > + { > + n += 2; > + level += 1; > + } > + else if (peek_input (n) == '*' > + && peek_input (n + 1) == '/') > + { > + n += 2; > + level -= 1; > + } > + else > + n++; > + } > + } > + else if (next_char != '[') > + { > + // definitely shebang, ignore the first line > + while (current_char != '\n' && current_char != EOF) > + { > + current_char = peek_input (); > + skip_input (); > + } > + > + // newline > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + break; > + } > + else > + break; /* Definitely not a shebang line. 
*/ > + } > + } > + > + // return end of file token if end of file > + if (current_char == EOF) > + return Token::make (END_OF_FILE, loc); > + > + // if not end of file, start tokenising > + switch (current_char) > + { > + /* ignore whitespace characters for tokens but continue updating > + * location */ > + case '\n': // newline > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + continue; > + case '\r': // cr > + // Ignore, we expect a newline (lf) soon. > + continue; > + case ' ': // space > + current_column++; > + continue; > + case '\t': // tab > + // width of a tab is not well-defined, assume 8 spaces > + current_column += 8; > + continue; > + > + // punctuation - actual tokens > + case '=': > + if (peek_input () == '>') > + { > + // match arm arrow > + skip_input (); > + current_column += 2; > + > + return Token::make (MATCH_ARROW, loc); > + } > + else if (peek_input () == '=') > + { > + // equality operator > + skip_input (); > + current_column += 2; > + > + return Token::make (EQUAL_EQUAL, loc); > + } > + else > + { > + // assignment operator > + current_column++; > + return Token::make (EQUAL, loc); > + } > + case '(': > + current_column++; > + return Token::make (LEFT_PAREN, loc); > + case '-': > + if (peek_input () == '>') > + { > + // return type specifier > + skip_input (); > + current_column += 2; > + > + return Token::make (RETURN_TYPE, loc); > + } > + else if (peek_input () == '=') > + { > + // minus-assign > + skip_input (); > + current_column += 2; > + > + return Token::make (MINUS_EQ, loc); > + } > + else > + { > + // minus > + current_column++; > + return Token::make (MINUS, loc); > + } > + case '+': > + if (peek_input () == '=') > + { > + // add-assign > + skip_input (); > + current_column += 2; > + > + return Token::make (PLUS_EQ, loc); > + } > + else > + { > + // add > + current_column++; > + return Token::make (PLUS, loc); > + } > + case ')': > + 
current_column++; > + return Token::make (RIGHT_PAREN, loc); > + case ';': > + current_column++; > + return Token::make (SEMICOLON, loc); > + case '*': > + if (peek_input () == '=') > + { > + // multiplication-assign > + skip_input (); > + current_column += 2; > + > + return Token::make (ASTERISK_EQ, loc); > + } > + else > + { > + // multiplication > + current_column++; > + return Token::make (ASTERISK, loc); > + } > + case ',': > + current_column++; > + return Token::make (COMMA, loc); > + case '/': > + if (peek_input () == '=') > + { > + // division-assign > + skip_input (); > + current_column += 2; > + > + return Token::make (DIV_EQ, loc); > + } > + else if ((peek_input () == '/' && peek_input (1) != '!' > + && peek_input (1) != '/') > + || (peek_input () == '/' && peek_input (1) == '/' > + && peek_input (2) == '/')) > + { > + // two // or four //// > + // single line comment > + // (but not an inner or outer doc comment) > + skip_input (); > + current_column += 2; > + current_char = peek_input (); > + > + // basically ignore until line finishes > + while (current_char != '\n' && current_char != EOF) > + { > + skip_input (); > + current_column++; // not used > + current_char = peek_input (); > + } > + continue; > + } > + else if (peek_input () == '/' > + && (peek_input (1) == '!' || peek_input (1) == '/')) > + { > + /* single line doc comment, inner or outer. 
*/ > + bool is_inner = peek_input (1) == '!'; > + skip_input (1); > + current_column += 3; > + > + std::string str; > + str.reserve (32); > + current_char = peek_input (); > + while (current_char != '\n') > + { > + skip_input (); > + if (current_char == '\r') > + { > + char next_char = peek_input (); > + if (next_char == '\n') > + { > + current_char = '\n'; > + break; > + } > + rust_error_at ( > + loc, "Isolated CR %<\\r%> not allowed in doc comment"); > + current_char = next_char; > + continue; > + } > + if (current_char == EOF) > + { > + rust_error_at ( > + loc, "unexpected EOF while looking for end of comment"); > + break; > + } > + str += current_char; > + current_char = peek_input (); > + } > + skip_input (); > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + > + str.shrink_to_fit (); > + if (is_inner) > + return Token::make_inner_doc_comment (loc, std::move (str)); > + else > + return Token::make_outer_doc_comment (loc, std::move (str)); > + } > + else if (peek_input () == '*' && peek_input (1) == '*' > + && peek_input (2) == '/') > + { > + /**/ > + skip_input (2); > + current_column += 4; > + continue; > + } > + else if (peek_input () == '*' && peek_input (1) == '*' > + && peek_input (2) == '*' && peek_input (3) == '/') > + { > + /***/ > + skip_input (3); > + current_column += 5; > + continue; > + } > + else if ((peek_input () == '*' && peek_input (1) != '!' 
> + && peek_input (1) != '*') > + || (peek_input () == '*' && peek_input (1) == '*' > + && peek_input (2) == '*')) > + { > + // one /* or three /*** > + // block comment > + // (but not an inner or outer doc comment) > + skip_input (); > + current_column += 2; > + > + int level = 1; > + while (level > 0) > + { > + current_char = peek_input (); > + > + if (current_char == EOF) > + { > + rust_error_at ( > + loc, "unexpected EOF while looking for end of comment"); > + break; > + } > + > + // if /* found > + if (current_char == '/' && peek_input (1) == '*') > + { > + // skip /* characters > + skip_input (1); > + > + current_column += 2; > + > + level += 1; > + continue; > + } > + > + // ignore until */ is found > + if (current_char == '*' && peek_input (1) == '/') > + { > + // skip */ characters > + skip_input (1); > + > + current_column += 2; > + > + level -= 1; > + continue; > + } > + > + if (current_char == '\n') > + { > + skip_input (); > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + continue; > + } > + > + skip_input (); > + current_column++; > + } > + > + // refresh new token > + continue; > + } > + else if (peek_input () == '*' > + && (peek_input (1) == '!' || peek_input (1) == '*')) > + { > + // block doc comment, inner /*! 
or outer /** > + bool is_inner = peek_input (1) == '!'; > + skip_input (1); > + current_column += 3; > + > + std::string str; > + str.reserve (96); > + > + int level = 1; > + while (level > 0) > + { > + current_char = peek_input (); > + > + if (current_char == EOF) > + { > + rust_error_at ( > + loc, "unexpected EOF while looking for end of comment"); > + break; > + } > + > + // if /* found > + if (current_char == '/' && peek_input (1) == '*') > + { > + // skip /* characters > + skip_input (1); > + current_column += 2; > + > + level += 1; > + str += "/*"; > + continue; > + } > + > + // ignore until */ is found > + if (current_char == '*' && peek_input (1) == '/') > + { > + // skip */ characters > + skip_input (1); > + current_column += 2; > + > + level -= 1; > + if (level > 0) > + str += "*/"; > + continue; > + } > + > + if (current_char == '\r' && peek_input (1) != '\n') > + rust_error_at ( > + loc, "Isolated CR %<\\r%> not allowed in doc comment"); > + > + if (current_char == '\n') > + { > + skip_input (); > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + str += '\n'; > + continue; > + } > + > + str += current_char; > + skip_input (); > + current_column++; > + } > + > + str.shrink_to_fit (); > + if (is_inner) > + return Token::make_inner_doc_comment (loc, std::move (str)); > + else > + return Token::make_outer_doc_comment (loc, std::move (str)); > + } > + else > + { > + // division > + current_column++; > + return Token::make (DIV, loc); > + } > + case '%': > + if (peek_input () == '=') > + { > + // modulo-assign > + skip_input (); > + current_column += 2; > + > + return Token::make (PERCENT_EQ, loc); > + } > + else > + { > + // modulo > + current_column++; > + return Token::make (PERCENT, loc); > + } > + case '^': > + if (peek_input () == '=') > + { > + // xor-assign? 
> + skip_input (); > + current_column += 2; > + > + return Token::make (CARET_EQ, loc); > + } > + else > + { > + // xor? > + current_column++; > + return Token::make (CARET, loc); > + } > + case '<': > + if (peek_input () == '<') > + { > + if (peek_input (1) == '=') > + { > + // left-shift assign > + skip_input (1); > + current_column += 3; > + > + return Token::make (LEFT_SHIFT_EQ, loc); > + } > + else > + { > + // left-shift > + skip_input (); > + current_column += 2; > + > + return Token::make (LEFT_SHIFT, loc); > + } > + } > + else if (peek_input () == '=') > + { > + // smaller than or equal to > + skip_input (); > + current_column += 2; > + > + return Token::make (LESS_OR_EQUAL, loc); > + } > + else > + { > + // smaller than > + current_column++; > + return Token::make (LEFT_ANGLE, loc); > + } > + break; > + case '>': > + if (peek_input () == '>') > + { > + if (peek_input (1) == '=') > + { > + // right-shift-assign > + skip_input (1); > + current_column += 3; > + > + return Token::make (RIGHT_SHIFT_EQ, loc); > + } > + else > + { > + // right-shift > + skip_input (); > + current_column += 2; > + > + return Token::make (RIGHT_SHIFT, loc); > + } > + } > + else if (peek_input () == '=') > + { > + // larger than or equal to > + skip_input (); > + current_column += 2; > + > + return Token::make (GREATER_OR_EQUAL, loc); > + } > + else > + { > + // larger than > + current_column++; > + return Token::make (RIGHT_ANGLE, loc); > + } > + case ':': > + if (peek_input () == ':') > + { > + // scope resolution :: > + skip_input (); > + current_column += 2; > + > + return Token::make (SCOPE_RESOLUTION, loc); > + } > + else > + { > + // single colon : > + current_column++; > + return Token::make (COLON, loc); > + } > + case '!': > + // no special handling for macros in lexer? 
> + if (peek_input () == '=') > + { > + // not equal boolean operator > + skip_input (); > + current_column += 2; > + > + return Token::make (NOT_EQUAL, loc); > + } > + else > + { > + // not equal unary operator > + current_column++; > + > + return Token::make (EXCLAM, loc); > + } > + case '?': > + current_column++; > + return Token::make (QUESTION_MARK, loc); > + case '#': > + current_column++; > + return Token::make (HASH, loc); > + case '[': > + current_column++; > + return Token::make (LEFT_SQUARE, loc); > + case ']': > + current_column++; > + return Token::make (RIGHT_SQUARE, loc); > + case '{': > + current_column++; > + return Token::make (LEFT_CURLY, loc); > + case '}': > + current_column++; > + return Token::make (RIGHT_CURLY, loc); > + case '@': > + current_column++; > + return Token::make (PATTERN_BIND, loc); > + case '$': > + current_column++; > + return Token::make (DOLLAR_SIGN, loc); > + case '~': > + current_column++; > + return Token::make (TILDE, loc); > + case '\\': > + current_column++; > + return Token::make (BACKSLASH, loc); > + case '`': > + current_column++; > + return Token::make (BACKTICK, loc); > + case '|': > + if (peek_input () == '=') > + { > + // bitwise or-assign? > + skip_input (); > + current_column += 2; > + > + return Token::make (PIPE_EQ, loc); > + } > + else if (peek_input () == '|') > + { > + // logical or > + skip_input (); > + current_column += 2; > + > + return Token::make (OR, loc); > + } > + else > + { > + // bitwise or > + current_column++; > + > + return Token::make (PIPE, loc); > + } > + case '&': > + if (peek_input () == '=') > + { > + // bitwise and-assign? 
> + skip_input (); > + current_column += 2; > + > + return Token::make (AMP_EQ, loc); > + } > + else if (peek_input () == '&') > + { > + // logical and > + skip_input (); > + current_column += 2; > + > + return Token::make (LOGICAL_AND, loc); > + } > + else > + { > + // bitwise and/reference > + current_column++; > + > + return Token::make (AMP, loc); > + } > + case '.': > + if (peek_input () == '.') > + { > + if (peek_input (1) == '.') > + { > + // ellipsis > + skip_input (1); > + current_column += 3; > + > + return Token::make (ELLIPSIS, loc); > + } > + else if (peek_input (1) == '=') > + { > + // ..= > + skip_input (1); > + current_column += 3; > + > + return Token::make (DOT_DOT_EQ, loc); > + } > + else > + { > + // .. > + skip_input (); > + current_column += 2; > + > + return Token::make (DOT_DOT, loc); > + } > + } > + else /*if (!ISDIGIT (peek_input ()))*/ > + { > + // single dot . > + // Only if followed by a non-number - otherwise is float > + // nope, float cannot start with '.'. > + current_column++; > + return Token::make (DOT, loc); > + } > + } > + // TODO: special handling of _ in the lexer? instead of being identifier > + > + // byte character, byte string and raw byte string literals > + if (current_char == 'b') > + { > + if (peek_input () == '\'') > + return parse_byte_char (loc); > + else if (peek_input () == '"') > + return parse_byte_string (loc); > + else if (peek_input () == 'r' > + && (peek_input (1) == '#' || peek_input (1) == '"')) > + return parse_raw_byte_string (loc); > + } > + > + // raw identifiers and raw strings > + if (current_char == 'r') > + { > + int peek = peek_input (); > + int peek1 = peek_input (1); > + > + if (peek == '#' && (ISALPHA (peek1) || peek1 == '_')) > + { > + TokenPtr raw_ident_ptr = parse_raw_identifier (loc); > + if (raw_ident_ptr != nullptr) > + return raw_ident_ptr; > + else > + continue; /* input got parsed, it just wasn't valid. An error > + was produced. 
*/ > + } > + else > + { > + TokenPtr maybe_raw_string_ptr = maybe_parse_raw_string (loc); > + if (maybe_raw_string_ptr != nullptr) > + return maybe_raw_string_ptr; > + } > + } > + > + // find identifiers and keywords > + if (ISALPHA (current_char) || current_char == '_') > + return parse_identifier_or_keyword (loc); > + > + // int and float literals > + if (ISDIGIT (current_char)) > + { // _ not allowed as first char > + if (current_char == '0' > + && is_non_decimal_int_literal_separator (peek_input ())) > + { > + // handle binary, octal, hex literals > + TokenPtr non_dec_int_lit_ptr > + = parse_non_decimal_int_literals (loc); > + if (non_dec_int_lit_ptr != nullptr) > + return non_dec_int_lit_ptr; > + } > + else > + { > + // handle decimals (integer or float) > + TokenPtr decimal_or_float_ptr = parse_decimal_int_or_float (loc); > + if (decimal_or_float_ptr != nullptr) > + return decimal_or_float_ptr; > + } > + } > + > + // string literals > + if (current_char == '"') > + return parse_string (loc); > + > + // char literals and lifetime names > + if (current_char == '\'') > + { > + TokenPtr char_or_lifetime_ptr = parse_char_or_lifetime (loc); > + if (char_or_lifetime_ptr != nullptr) > + return char_or_lifetime_ptr; > + } > + > + // DEBUG: check for specific character problems: > + if (current_char == '0') > + rust_debug ("'0' uncaught before unexpected character"); > + else if (current_char == ']') > + rust_debug ("']' uncaught before unexpected character"); > + else if (current_char == 0x5d) > + rust_debug ("whatever 0x5d is (not '0' or ']') uncaught before " > + "unexpected character"); > + > + // didn't match anything so error > + rust_error_at (loc, "unexpected character %<%x%>", current_char); > + current_column++; > + } > +} > + > +// Parses in a type suffix. 
> +std::pair > +Lexer::parse_in_type_suffix () > +{ > + std::string suffix; > + suffix.reserve (5); > + > + int additional_length_offset = 0; > + > + // get suffix > + while (ISALPHA (current_char) || ISDIGIT (current_char) > + || current_char == '_') > + { > + if (current_char == '_') > + { > + // don't add _ to suffix > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + > + continue; > + } > + > + additional_length_offset++; > + > + suffix += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + if (suffix.empty ()) > + { > + // no type suffix: do nothing but also no error > + return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); > + } > + else if (suffix == "f32") > + { > + return std::make_pair (CORETYPE_F32, additional_length_offset); > + } > + else if (suffix == "f64") > + { > + return std::make_pair (CORETYPE_F64, additional_length_offset); > + } > + else if (suffix == "i8") > + { > + return std::make_pair (CORETYPE_I8, additional_length_offset); > + } > + else if (suffix == "i16") > + { > + return std::make_pair (CORETYPE_I16, additional_length_offset); > + } > + else if (suffix == "i32") > + { > + return std::make_pair (CORETYPE_I32, additional_length_offset); > + } > + else if (suffix == "i64") > + { > + return std::make_pair (CORETYPE_I64, additional_length_offset); > + } > + else if (suffix == "i128") > + { > + return std::make_pair (CORETYPE_I128, additional_length_offset); > + } > + else if (suffix == "isize") > + { > + return std::make_pair (CORETYPE_ISIZE, additional_length_offset); > + } > + else if (suffix == "u8") > + { > + return std::make_pair (CORETYPE_U8, additional_length_offset); > + } > + else if (suffix == "u16") > + { > + return std::make_pair (CORETYPE_U16, additional_length_offset); > + } > + else if (suffix == "u32") > + { > + return std::make_pair (CORETYPE_U32, additional_length_offset); > + } > + else if (suffix == "u64") > + { > + return 
std::make_pair (CORETYPE_U64, additional_length_offset); > + } > + else if (suffix == "u128") > + { > + return std::make_pair (CORETYPE_U128, additional_length_offset); > + } > + else if (suffix == "usize") > + { > + return std::make_pair (CORETYPE_USIZE, additional_length_offset); > + } > + else > + { > + rust_error_at (get_current_location (), "unknown number suffix %qs", > + suffix.c_str ()); > + > + return std::make_pair (CORETYPE_UNKNOWN, additional_length_offset); > + } > +} > + > +// Parses in the exponent part (if any) of a float literal. > +std::pair > +Lexer::parse_in_exponent_part () > +{ > + int additional_length_offset = 0; > + std::string str; > + if (current_char == 'E' || current_char == 'e') > + { > + // add exponent to string as strtod works with it > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + > + // special - and + handling > + if (current_char == '-') > + { > + str += '-'; > + > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + } > + else if (current_char == '+') > + { > + // don't add + but still skip input > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + } > + > + // parse another decimal number for exponent > + auto str_length = parse_in_decimal (); > + str += std::get<0> (str_length); > + additional_length_offset += std::get<1> (str_length); > + } > + return std::make_pair (str, additional_length_offset); > +} > + > +// Parses a decimal integer. > +std::tuple > +Lexer::parse_in_decimal () > +{ > + /* A pure decimal contains only digits. 
*/ > + bool pure_decimal = true; > + int additional_length_offset = 0; > + std::string str; > + while (ISDIGIT (current_char) || current_char == '_') > + { > + if (current_char == '_') > + { > + pure_decimal = false; > + // don't add _ to number > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + > + continue; > + } > + > + additional_length_offset++; > + > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + return std::make_tuple (str, additional_length_offset, pure_decimal); > +} > + > +/* Parses escapes (and string continues) in "byte" strings and characters. Does > + * not support unicode. */ > +std::tuple > +Lexer::parse_escape (char opening_char) > +{ > + int additional_length_offset = 0; > + char output_char = 0; > + > + // skip to actual letter > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + > + switch (current_char) > + { > + case 'x': { > + auto hex_escape_pair = parse_partial_hex_escape (); > + long hexLong = hex_escape_pair.first; > + additional_length_offset += hex_escape_pair.second; > + > + if (hexLong > 255 || hexLong < 0) > + rust_error_at ( > + get_current_location (), > + "byte \\x escape %<\\x%x%> out of range - allows up to %<\\xFF%>", > + static_cast (hexLong)); > + /* TODO: restore capital for escape output - gcc pretty-printer doesn't > + * support %X directly */ > + char hexChar = static_cast (hexLong); > + > + output_char = hexChar; > + } > + break; > + case 'n': > + output_char = '\n'; > + break; > + case 'r': > + output_char = '\r'; > + break; > + case 't': > + output_char = '\t'; > + break; > + case '\\': > + output_char = '\\'; > + break; > + case '0': > + output_char = '\0'; > + break; > + case '\'': > + output_char = '\''; > + break; > + case '"': > + output_char = '"'; > + break; > + case 'u': > + rust_error_at (get_current_location (), > + "cannot have a unicode escape \\u in a byte %s", > + opening_char == '\'' ? 
"character" : "string"); > + // Try to parse it anyway, just to skip it > + parse_partial_unicode_escape (); > + return std::make_tuple (output_char, additional_length_offset, false); > + case '\r': > + case '\n': > + // string continue > + return std::make_tuple (0, parse_partial_string_continue (), true); > + default: > + rust_error_at (get_current_location (), > + "unknown escape sequence %<\\%c%>", current_char); > + // returns false if no parsing could be done > + // return false; > + return std::make_tuple (output_char, additional_length_offset, false); > + break; > + } > + // all non-special cases (string continue) should skip their used char > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + > + // returns true if parsing was successful > + // return true; > + return std::make_tuple (output_char, additional_length_offset, false); > +} > + > +/* Parses an escape (or string continue) in a string or character. Supports > + * unicode escapes. */ > +std::tuple > +Lexer::parse_utf8_escape (char opening_char) > +{ > + Codepoint output_char; > + int additional_length_offset = 0; > + > + // skip to actual letter > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + > + switch (current_char) > + { > + case 'x': { > + auto hex_escape_pair = parse_partial_hex_escape (); > + long hexLong = hex_escape_pair.first; > + additional_length_offset += hex_escape_pair.second; > + > + if (hexLong > 127 || hexLong < 0) > + rust_error_at ( > + get_current_location (), > + "ascii \\x escape %<\\x%x%> out of range - allows up to %<\\x7F%>", > + static_cast (hexLong)); > + /* TODO: restore capital for escape output - gcc pretty-printer doesn't > + * support %X directly */ > + char hexChar = static_cast (hexLong); > + > + output_char = hexChar; > + } > + break; > + case 'n': > + output_char = '\n'; > + break; > + case 'r': > + output_char = '\r'; > + break; > + case 't': > + output_char = '\t'; > + break; > + case 
'\\': > + output_char = '\\'; > + break; > + case '0': > + output_char = '\0'; > + break; > + case '\'': > + output_char = '\''; > + break; > + case '"': > + output_char = '"'; > + break; > + case 'u': { > + auto unicode_escape_pair = parse_partial_unicode_escape (); > + output_char = unicode_escape_pair.first; > + additional_length_offset += unicode_escape_pair.second; > + > + return std::make_tuple (output_char, additional_length_offset, false); > + } > + break; > + case '\r': > + case '\n': > + // string continue > + return std::make_tuple (0, parse_partial_string_continue (), true); > + default: > + rust_error_at (get_current_location (), > + "unknown escape sequence %<\\%c%>", current_char); > + // returns false if no parsing could be done > + // return false; > + return std::make_tuple (output_char, additional_length_offset, false); > + break; > + } > + /* all non-special cases (unicode, string continue) should skip their used > + * char */ > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + > + // returns true if parsing was successful > + // return true; > + return std::make_tuple (output_char, additional_length_offset, false); > +} > + > +// Parses the body of a string continue that has been found in an escape. > +int > +Lexer::parse_partial_string_continue () > +{ > + int additional_length_offset = 1; > + > + // string continue > + while (is_whitespace (current_char)) > + { > + if (current_char == '\n') > + { > + current_line++; > + current_column = 1; > + // tell line_table that new line starts > + start_line (current_line, max_column_hint); > + > + // reset "length" > + additional_length_offset = 1; > + > + // get next char > + skip_input (); > + current_char = peek_input (); > + > + continue; > + } > + > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + } > + > + return additional_length_offset; > +} > + > +/* Parses the body of a '\x' escape. 
Note that it does not check that the number > + * is valid and smaller than 255. */ > +std::pair > +Lexer::parse_partial_hex_escape () > +{ > + // hex char string (null-terminated) > + char hexNum[3] = {0, 0, 0}; > + > + // first hex char > + current_char = peek_input (1); > + int additional_length_offset = 1; > + > + if (!is_x_digit (current_char)) > + { > + rust_error_at (get_current_location (), > + "invalid character %<\\x%c%> in \\x sequence", > + current_char); > + return std::make_pair (0, 0); > + } > + hexNum[0] = current_char; > + > + // second hex char > + skip_input (); > + current_char = peek_input (1); > + additional_length_offset++; > + > + if (!is_x_digit (current_char)) > + { > + rust_error_at (get_current_location (), > + "invalid character %<\\x%c%c%> in \\x sequence", hexNum[0], > + current_char); > + return std::make_pair (0, 1); > + } > + skip_input (); > + hexNum[1] = current_char; > + > + long hexLong = std::strtol (hexNum, nullptr, 16); > + > + return std::make_pair (hexLong, additional_length_offset); > +} > + > +// Parses the body of a unicode escape. > +std::pair > +Lexer::parse_partial_unicode_escape () > +{ > + skip_input (); > + current_char = peek_input (); > + int additional_length_offset = 0; > + > + if (current_char != '{') > + { > + rust_error_at (get_current_location (), > + "unicode escape should start with %<{%>"); > + /* Skip what should probaby have been between brackets. 
*/ > + while (is_x_digit (current_char) || current_char == '_') > + { > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + } > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + > + if (current_char == '_') > + { > + rust_error_at (get_current_location (), > + "unicode escape cannot start with %<_%>"); > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + // fallthrough and try to parse the rest anyway > + } > + > + // parse unicode escape - 1-6 hex digits > + std::string num_str; > + num_str.reserve (6); > + > + // loop through to add entire hex number to string > + while (is_x_digit (current_char) || current_char == '_') > + { > + if (current_char == '_') > + { > + // don't add _ to number > + skip_input (); > + current_char = peek_input (); > + > + additional_length_offset++; > + > + continue; > + } > + > + additional_length_offset++; > + > + // add raw hex numbers > + num_str += current_char; > + > + skip_input (); > + current_char = peek_input (); > + } > + > + if (current_char == '}') > + { > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + } > + else > + { > + // actually an error, but allow propagation anyway Assume that > + // wrong bracketm whitespace or single/double quotes are wrong > + // termination, otherwise it is a wrong character, then skip to the actual > + // terminator. 
> + if (current_char == '{' || is_whitespace (current_char) > + || current_char == '\'' || current_char == '"') > + { > + rust_error_at (get_current_location (), > + "expected terminating %<}%> in unicode escape"); > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + else > + { > + rust_error_at (get_current_location (), > + "invalid character %<%c%> in unicode escape", > + current_char); > + while (current_char != '}' && current_char != '{' > + && !is_whitespace (current_char) && current_char != '\'' > + && current_char != '"') > + { > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + } > + // Consume the actual closing bracket if found > + if (current_char == '}') > + { > + skip_input (); > + current_char = peek_input (); > + additional_length_offset++; > + } > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + } > + > + // ensure 1-6 hex characters > + if (num_str.length () > 6 || num_str.length () < 1) > + { > + rust_error_at (get_current_location (), > + "unicode escape should be between 1 and 6 hex " > + "characters; it is %lu", > + (unsigned long) num_str.length ()); > + // return false; > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + > + unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16); > + > + if (hex_num > 0xd7ff && hex_num < 0xe000) > + { > + rust_error_at ( > + get_current_location (), > + "unicode escape cannot be a surrogate value (D800 to DFFF)"); > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + > + if (hex_num > 0x10ffff) > + { > + rust_error_at (get_current_location (), > + "unicode escape cannot be larger than 10FFFF"); > + return std::make_pair (Codepoint (0), additional_length_offset); > + } > + > + // return true; > + return std::make_pair (Codepoint (static_cast (hex_num)), > + additional_length_offset); > +} > + > +// Parses a byte character. 
> +TokenPtr > +Lexer::parse_byte_char (Location loc) > +{ > + skip_input (); > + current_column++; > + // make current char the next character > + current_char = peek_input (); > + > + int length = 1; > + > + // char to save > + char byte_char = 0; > + > + // detect escapes > + if (current_char == '\\') > + { > + auto escape_length_pair = parse_escape ('\''); > + byte_char = std::get<0> (escape_length_pair); > + length += std::get<1> (escape_length_pair); > + > + current_char = peek_input (); > + > + if (current_char != '\'') > + { > + rust_error_at (get_current_location (), "unclosed %"); > + } > + > + skip_input (); > + current_char = peek_input (); > + length++; // go to next char > + } > + else if (current_char != '\'') > + { > + // otherwise, get character from direct input character > + byte_char = current_char; > + > + skip_input (); > + current_char = peek_input (); > + length++; > + > + if (current_char != '\'') > + { > + rust_error_at (get_current_location (), "unclosed %"); > + } > + > + skip_input (); > + current_char = peek_input (); > + length++; // go to next char > + } > + else > + { > + rust_error_at (get_current_location (), > + "no character inside %<%> for %"); > + } > + > + current_column += length; > + > + return Token::make_byte_char (loc, byte_char); > +} > + > +// Parses a byte string. 
> +TokenPtr > +Lexer::parse_byte_string (Location loc) > +{ > + // byte string > + > + // skip quote character > + skip_input (); > + current_column++; > + > + std::string str; > + str.reserve (16); // some sensible default > + > + int length = 1; > + current_char = peek_input (); > + > + while (current_char != '"' && current_char != EOF) > + { > + if (current_char == '\\') > + { > + auto escape_length_pair = parse_escape ('"'); > + char output_char = std::get<0> (escape_length_pair); > + > + if (output_char == 0 && std::get<2> (escape_length_pair)) > + length = std::get<1> (escape_length_pair) - 1; > + else > + length += std::get<1> (escape_length_pair); > + > + if (output_char != 0 || !std::get<2> (escape_length_pair)) > + str += output_char; > + > + continue; > + } > + > + length++; > + > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + current_column += length; > + > + if (current_char == '"') > + { > + current_column++; > + > + skip_input (); > + current_char = peek_input (); > + } > + else if (current_char == EOF) > + { > + rust_error_at (get_current_location (), "unended byte string literal"); > + return Token::make (END_OF_FILE, get_current_location ()); > + } > + else > + { > + gcc_unreachable (); > + } > + > + str.shrink_to_fit (); > + > + return Token::make_byte_string (loc, std::move (str)); > +} > + > +// Parses a raw byte string. 
> +TokenPtr > +Lexer::parse_raw_byte_string (Location loc) > +{ > + // raw byte string literals > + std::string str; > + str.reserve (16); // some sensible default > + > + int length = 1; > + int hash_count = 0; > + > + // get hash count at beginnning > + skip_input (); > + current_char = peek_input (); > + length++; > + while (current_char == '#') > + { > + hash_count++; > + length++; > + > + skip_input (); > + current_char = peek_input (); > + } > + > + if (current_char != '"') > + { > + rust_error_at (get_current_location (), > + "raw byte string has no opening %<\"%>"); > + } > + > + skip_input (); > + current_char = peek_input (); > + length++; > + > + while (true) > + { > + if (current_char == '"') > + { > + bool enough_hashes = true; > + > + for (int i = 0; i < hash_count; i++) > + { > + if (peek_input (i + 1) != '#') > + { > + enough_hashes = false; > + break; > + } > + } > + > + if (enough_hashes) > + { > + // skip enough input and peek enough input > + skip_input (hash_count); > + current_char = peek_input (); > + length += hash_count + 1; > + break; > + } > + } > + > + if ((unsigned char) current_char > 127) > + { > + rust_error_at (get_current_location (), > + "character %<%c%> in raw byte string out of range", > + current_char); > + current_char = 0; > + } > + > + length++; > + > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + current_column += length; > + > + str.shrink_to_fit (); > + > + return Token::make_byte_string (loc, std::move (str)); > +} > + > +// Parses a raw identifier. 
> +TokenPtr > +Lexer::parse_raw_identifier (Location loc) > +{ > + // raw identifier > + std::string str; > + str.reserve (16); // default > + > + skip_input (); > + current_char = peek_input (); > + > + current_column += 2; > + > + bool first_is_underscore = current_char == '_'; > + > + int length = 0; > + current_char = peek_input (); > + // loop through entire name > + while (ISALPHA (current_char) || ISDIGIT (current_char) > + || current_char == '_') > + { > + length++; > + > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + current_column += length; > + > + // if just a single underscore, not an identifier > + if (first_is_underscore && length == 1) > + rust_error_at (get_current_location (), > + "%<_%> is not a valid raw identifier"); > + > + if (str == "crate" || str == "extern" || str == "self" || str == "super" > + || str == "Self") > + { > + rust_error_at (get_current_location (), > + "%qs is a forbidden raw identifier", str.c_str ()); > + > + return nullptr; > + } > + else > + { > + str.shrink_to_fit (); > + > + return Token::make_identifier (loc, std::move (str)); > + } > +} > + > +// skip broken string input (unterminated strings) > +void > +Lexer::skip_broken_string_input (int current_char) > +{ > + while (current_char != '"' && current_char != EOF) > + { > + if (current_char == '\n') > + { > + current_line++; > + current_column = 1; > + } > + else > + { > + current_column++; > + } > + skip_input (); > + current_char = peek_input (); > + } > + if (current_char == '"') > + { > + current_column++; > + > + skip_input (); > + current_char = peek_input (); > + } > + rust_debug ("skipped to %d:%d due to bad quotes", current_line, > + current_column); > +} > + > +// Parses a unicode string. 
> +TokenPtr > +Lexer::parse_string (Location loc) > +{ > + Codepoint current_char32; > + > + std::string str; > + str.reserve (16); // some sensible default > + > + int length = 1; > + current_char32 = peek_codepoint_input (); > + > + // FIXME: This fails if the input ends. How do we check for EOF? > + while (current_char32.value != '"' && !current_char32.is_eof ()) > + { > + if (current_char32.value == '\\') > + { > + // parse escape > + auto utf8_escape_pair = parse_utf8_escape ('\''); > + current_char32 = std::get<0> (utf8_escape_pair); > + > + if (current_char32 == Codepoint (0) && std::get<2> (utf8_escape_pair)) > + length = std::get<1> (utf8_escape_pair) - 1; > + else > + length += std::get<1> (utf8_escape_pair); > + > + if (current_char32 != Codepoint (0) > + || !std::get<2> (utf8_escape_pair)) > + str += current_char32; > + > + // required as parsing utf8 escape only changes current_char > + current_char32 = peek_codepoint_input (); > + > + continue; > + } > + > + length += get_input_codepoint_length (); > + > + str += current_char32; > + skip_codepoint_input (); > + current_char32 = peek_codepoint_input (); > + } > + > + current_column += length; > + > + if (current_char32.value == '"') > + { > + current_column++; > + > + skip_input (); > + current_char = peek_input (); > + } > + else if (current_char32.is_eof ()) > + { > + rust_error_at (get_current_location (), "unended string literal"); > + return Token::make (END_OF_FILE, get_current_location ()); > + } > + else > + { > + gcc_unreachable (); > + } > + > + str.shrink_to_fit (); > + return Token::make_string (loc, std::move (str)); > +} > + > +// Parses an identifier or keyword. 
> +TokenPtr > +Lexer::parse_identifier_or_keyword (Location loc) > +{ > + std::string str; > + str.reserve (16); // default > + str += current_char; > + > + bool first_is_underscore = current_char == '_'; > + > + int length = 1; > + current_char = peek_input (); > + // loop through entire name > + while (ISALPHA (current_char) || ISDIGIT (current_char) > + || current_char == '_') > + { > + length++; > + > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + current_column += length; > + > + // if just a single underscore, not an identifier > + if (first_is_underscore && length == 1) > + return Token::make (UNDERSCORE, loc); > + > + str.shrink_to_fit (); > + > + TokenId keyword = classify_keyword (str); > + if (keyword == IDENTIFIER) > + return Token::make_identifier (loc, std::move (str)); > + else > + return Token::make (keyword, loc); > +} > + > +// Possibly returns a raw string token if it exists - otherwise returns null. > +TokenPtr > +Lexer::maybe_parse_raw_string (Location loc) > +{ > + int peek_index = 0; > + while (peek_input (peek_index) == '#') > + peek_index++; > + > + if (peek_input (peek_index) == '"') > + return parse_raw_string (loc, peek_index); > + else > + return nullptr; > +} > + > +// Returns a raw string token. 
> +TokenPtr > +Lexer::parse_raw_string (Location loc, int initial_hash_count) > +{ > + // raw string literals > + std::string str; > + str.reserve (16); // some sensible default > + > + int length = 1 + initial_hash_count; > + > + if (initial_hash_count > 0) > + skip_input (initial_hash_count - 1); > + > + current_char = peek_input (); > + > + if (current_char != '"') > + rust_error_at (get_current_location (), "raw string has no opening %<\"%>"); > + > + length++; > + skip_input (); > + Codepoint current_char32 = peek_codepoint_input (); > + > + while (!current_char32.is_eof ()) > + { > + if (current_char32.value == '"') > + { > + bool enough_hashes = true; > + > + for (int i = 0; i < initial_hash_count; i++) > + { > + if (peek_input (i + 1) != '#') > + { > + enough_hashes = false; > + break; > + } > + } > + > + if (enough_hashes) > + { > + // skip enough input and peek enough input > + skip_input (initial_hash_count); > + current_char = peek_input (); > + length += initial_hash_count + 1; > + break; > + } > + } > + > + length++; > + > + str += current_char32; > + skip_codepoint_input (); > + current_char32 = peek_codepoint_input (); > + } > + > + current_column += length; > + > + str.shrink_to_fit (); > + > + return Token::make_string (loc, std::move (str)); > +} > + > +template > +TokenPtr > +Lexer::parse_non_decimal_int_literal (Location loc, IsDigitFunc is_digit_func, > + std::string existent_str, int base) > +{ > + int length = 1; > + > + skip_input (); > + current_char = peek_input (); > + > + length++; > + > + // loop through to add entire number to string > + while (is_digit_func (current_char) || current_char == '_') > + { > + if (current_char == '_') > + { > + // don't add _ to number > + skip_input (); > + current_char = peek_input (); > + > + length++; > + > + continue; > + } > + > + length++; > + > + // add raw numbers > + existent_str += current_char; > + skip_input (); > + current_char = peek_input (); > + } > + > + // convert value to decimal 
representation > + long dec_num = std::strtol (existent_str.c_str (), nullptr, base); > + > + existent_str = std::to_string (dec_num); > + > + // parse in type suffix if it exists > + auto type_suffix_pair = parse_in_type_suffix (); > + PrimitiveCoreType type_hint = type_suffix_pair.first; > + length += type_suffix_pair.second; > + > + current_column += length; > + > + if (type_hint == CORETYPE_F32 || type_hint == CORETYPE_F64) > + { > + rust_error_at (get_current_location (), > + "invalid type suffix %qs for integer (%s) literal", > + get_type_hint_string (type_hint), > + base == 16 > + ? "hex" > + : (base == 8 ? "octal" > + : (base == 2 ? "binary" > + : ""))); > + return nullptr; > + } > + return Token::make_int (loc, std::move (existent_str), type_hint); > +} > + > +// Parses a hex, binary or octal int literal. > +TokenPtr > +Lexer::parse_non_decimal_int_literals (Location loc) > +{ > + std::string str; > + str.reserve (16); // some sensible default > + str += current_char; > + > + current_char = peek_input (); > + > + if (current_char == 'x') > + { > + // hex (integer only) > + return parse_non_decimal_int_literal (loc, is_x_digit, str + "x", 16); > + } > + else if (current_char == 'o') > + { > + // octal (integer only) > + return parse_non_decimal_int_literal (loc, is_octal_digit, > + std::move (str), 8); > + } > + else if (current_char == 'b') > + { > + // binary (integer only) > + return parse_non_decimal_int_literal (loc, is_bin_digit, std::move (str), > + 2); > + } > + else > + { > + return nullptr; > + } > +} > + > +// Parses a decimal-based int literal or float literal. 
> +TokenPtr > +Lexer::parse_decimal_int_or_float (Location loc) > +{ > + std::string str; > + str.reserve (16); // some sensible default > + str += current_char; > + > + int length = 1; > + bool first_zero = current_char == '0'; > + > + current_char = peek_input (); > + > + // parse initial decimal integer (or first integer part of float) literal > + auto initial_decimal = parse_in_decimal (); > + str += std::get<0> (initial_decimal); > + length += std::get<1> (initial_decimal); > + > + // detect float literal > + if (current_char == '.' && is_float_digit (peek_input (1))) > + { > + // float with a '.', parse another decimal into it > + > + // add . to str > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + length++; > + > + // parse another decimal number for float > + auto second_decimal = parse_in_decimal (); > + str += std::get<0> (second_decimal); > + length += std::get<1> (second_decimal); > + > + // parse in exponent part if it exists > + auto exponent_pair = parse_in_exponent_part (); > + str += exponent_pair.first; > + length += exponent_pair.second; > + > + // parse in type suffix if it exists > + auto type_suffix_pair = parse_in_type_suffix (); > + PrimitiveCoreType type_hint = type_suffix_pair.first; > + length += type_suffix_pair.second; > + > + if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 > + && type_hint != CORETYPE_UNKNOWN) > + { > + rust_error_at (get_current_location (), > + "invalid type suffix %qs for floating-point literal", > + get_type_hint_string (type_hint)); > + // ignore invalid type suffix as everything else seems fine > + type_hint = CORETYPE_UNKNOWN; > + } > + > + current_column += length; > + > + str.shrink_to_fit (); > + return Token::make_float (loc, std::move (str), type_hint); > + } > + else if (current_char == '.' && check_valid_float_dot_end (peek_input (1))) > + { > + // float that is just an integer with a terminating '.' character > + > + // add . 
to str > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + length++; > + > + // add a '0' after the . to prevent ambiguity > + str += '0'; > + > + // type hint not allowed > + > + current_column += length; > + > + str.shrink_to_fit (); > + return Token::make_float (loc, std::move (str), CORETYPE_UNKNOWN); > + } > + else if (current_char == 'E' || current_char == 'e') > + { > + // exponent float with no '.' character > + > + // parse exponent part > + auto exponent_pair = parse_in_exponent_part (); > + str += exponent_pair.first; > + length += exponent_pair.second; > + > + // parse in type suffix if it exists > + auto type_suffix_pair = parse_in_type_suffix (); > + PrimitiveCoreType type_hint = type_suffix_pair.first; > + length += type_suffix_pair.second; > + > + if (type_hint != CORETYPE_F32 && type_hint != CORETYPE_F64 > + && type_hint != CORETYPE_UNKNOWN) > + { > + rust_error_at (get_current_location (), > + "invalid type suffix %qs for floating-point literal", > + get_type_hint_string (type_hint)); > + // ignore invalid type suffix as everything else seems fine > + type_hint = CORETYPE_UNKNOWN; > + } > + > + current_column += length; > + > + str.shrink_to_fit (); > + return Token::make_float (loc, std::move (str), type_hint); > + } > + else > + { > + // is an integer > + > + // parse in type suffix if it exists > + auto type_suffix_pair = parse_in_type_suffix (); > + PrimitiveCoreType type_hint = type_suffix_pair.first; > + /* A "real" pure decimal doesn't have a suffix and no zero prefix. 
*/ > + if (type_hint == CORETYPE_UNKNOWN) > + { > + bool pure_decimal = std::get<2> (initial_decimal); > + if (pure_decimal && (!first_zero || str.size () == 1)) > + type_hint = CORETYPE_PURE_DECIMAL; > + } > + length += type_suffix_pair.second; > + > + current_column += length; > + > + str.shrink_to_fit (); > + return Token::make_int (loc, std::move (str), type_hint); > + } > +} > + > +TokenPtr > +Lexer::parse_char_or_lifetime (Location loc) > +{ > + Codepoint current_char32; > + > + int length = 1; > + > + current_char32 = peek_codepoint_input (); > + if (current_char32.is_eof ()) > + return nullptr; > + > + // parse escaped char literal > + if (current_char32.value == '\\') > + { > + // parse escape > + auto utf8_escape_pair = parse_utf8_escape ('\''); > + current_char32 = std::get<0> (utf8_escape_pair); > + length += std::get<1> (utf8_escape_pair); > + > + if (peek_codepoint_input ().value != '\'') > + { > + rust_error_at (get_current_location (), "unended character literal"); > + } > + else > + { > + skip_codepoint_input (); > + current_char = peek_input (); > + length++; > + } > + > + current_column += length; > + > + return Token::make_char (loc, current_char32); > + } > + else > + { > + skip_codepoint_input (); > + > + if (peek_codepoint_input ().value == '\'') > + { > + // parse non-escaped char literal > + > + // skip the ' character > + skip_input (); > + current_char = peek_input (); > + > + // TODO fix due to different widths of utf-8 chars? 
> + current_column += 3; > + > + return Token::make_char (loc, current_char32); > + } > + else if (ISDIGIT (current_char32.value) || ISALPHA (current_char32.value) > + || current_char32.value == '_') > + { > + // parse lifetime name > + std::string str; > + str += current_char32; > + length++; > + > + current_char = peek_input (); > + while (ISDIGIT (current_char) || ISALPHA (current_char) > + || current_char == '_') > + { > + str += current_char; > + skip_input (); > + current_char = peek_input (); > + length++; > + } > + > + current_column += length; > + > + str.shrink_to_fit (); > + return Token::make_lifetime (loc, std::move (str)); > + } > + else > + { > + rust_error_at ( > + get_current_location (), > + "expected %' after character constant in character literal"); > + return nullptr; > + } > + } > +} > + > +// Returns the length of the codepoint at the current position. > +int > +Lexer::get_input_codepoint_length () > +{ > + uint8_t input = peek_input (); > + > + if ((int8_t) input == EOF) > + return 0; > + > + if (input < 128) > + { > + // ascii -- 1 byte > + // return input; > + > + return 1; > + } > + else if ((input & 0xC0) == 0x80) > + { > + // invalid (continuation; can't be first char) > + // return 0xFFFE; > + > + return 0; > + } > + else if ((input & 0xE0) == 0xC0) > + { > + // 2 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); > + // return output; > + return 2; > + } > + else if ((input & 0xF0) == 0xE0) > + { > + // 3 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + uint8_t input3 = peek_input (2); > + if ((input3 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + /*uint32_t output > + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << > + 0); return output;*/ > + return 3; > + } > + else if ((input & 0xF8) == 
0xF0) > + { > + // 4 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + uint8_t input3 = peek_input (2); > + if ((input3 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + uint8_t input4 = peek_input (3); > + if ((input4 & 0xC0) != 0x80) > + return 0; > + // return 0xFFFE; > + > + /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) > + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); > + return output;*/ > + return 4; > + } > + else > + { > + rust_error_at (get_current_location (), > + "invalid UTF-8 [FIRST] (too long)"); > + return 0; > + } > +} > + > +// Returns the codepoint at the current position. > +Codepoint > +Lexer::peek_codepoint_input () > +{ > + uint8_t input = peek_input (); > + > + if ((int8_t) input == EOF) > + return Codepoint::eof (); > + > + if (input < 128) > + { > + // ascii -- 1 byte > + return {input}; > + } > + else if ((input & 0xC0) == 0x80) > + { > + // invalid (continuation; can't be first char) > + return {0xFFFE}; > + } > + else if ((input & 0xE0) == 0xC0) > + { > + // 2 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return {0xFFFE}; > + > + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); > + return {output}; > + } > + else if ((input & 0xF0) == 0xE0) > + { > + // 3 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return {0xFFFE}; > + > + uint8_t input3 = peek_input (2); > + if ((input3 & 0xC0) != 0x80) > + return {0xFFFE}; > + > + uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) > + | ((input3 & 0x3F) << 0); > + return {output}; > + } > + else if ((input & 0xF8) == 0xF0) > + { > + // 4 bytes > + uint8_t input2 = peek_input (1); > + if ((input2 & 0xC0) != 0x80) > + return {0xFFFE}; > + > + uint8_t input3 = peek_input (2); > + if ((input3 & 0xC0) != 0x80) > + return {0xFFFE}; > + > + uint8_t input4 = peek_input (3); > + if ((input4 & 0xC0) != 
0x80) > + return {0xFFFE}; > + > + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) > + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); > + return {output}; > + } > + else > + { > + rust_error_at (get_current_location (), > + "invalid UTF-8 [SECND] (too long)"); > + return {0xFFFE}; > + } > +} > + > +void > +Lexer::skip_codepoint_input () > +{ > + int toSkip = get_input_codepoint_length (); > + gcc_assert (toSkip >= 1); > + > + skip_input (toSkip - 1); > +} > + > +int > +Lexer::test_get_input_codepoint_n_length (int n_start_offset) > +{ > + uint8_t input = peek_input (n_start_offset); > + > + if (input < 128) > + { > + // ascii -- 1 byte > + // return input; > + return 1; > + } > + else if ((input & 0xC0) == 0x80) > + { > + // invalid (continuation; can't be first char) > + // return 0xFFFE; > + return 0; > + } > + else if ((input & 0xE0) == 0xC0) > + { > + // 2 bytes > + uint8_t input2 = peek_input (n_start_offset + 1); > + if ((input2 & 0xC0) != 0x80) > + // return 0xFFFE; > + return 0; > + > + // uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); > + // return output; > + return 2; > + } > + else if ((input & 0xF0) == 0xE0) > + { > + // 3 bytes > + uint8_t input2 = peek_input (n_start_offset + 1); > + if ((input2 & 0xC0) != 0x80) > + // return 0xFFFE; > + return 0; > + > + uint8_t input3 = peek_input (n_start_offset + 2); > + if ((input3 & 0xC0) != 0x80) > + // return 0xFFFE; > + return 0; > + > + /*uint32_t output > + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << > + 0); return output;*/ > + return 3; > + } > + else if ((input & 0xF8) == 0xF0) > + { > + // 4 bytes > + uint8_t input2 = peek_input (n_start_offset + 1); > + if ((input2 & 0xC0) != 0x80) > + // return 0xFFFE; > + return 0; > + > + uint8_t input3 = peek_input (n_start_offset + 2); > + if ((input3 & 0xC0) != 0x80) > + // return 0xFFFE; > + return 0; > + > + uint8_t input4 = peek_input (n_start_offset + 3); > + if ((input4 & 0xC0) != 0x80) > 
+ // return 0xFFFE; > + return 0; > + > + /*uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) > + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); > + return output;*/ > + return 4; > + } > + else > + { > + rust_error_at (get_current_location (), > + "invalid UTF-8 [THIRD] (too long)"); > + return 0; > + } > +} > + > +// peeks the codepoint input at n codepoints ahead of current codepoint - try > +// not to use > +Codepoint > +Lexer::test_peek_codepoint_input (int n) > +{ > + int totalOffset = 0; > + > + // add up all offsets into total offset? does this do what I want? > + for (int i = 0; i < n; i++) > + { > + totalOffset += test_get_input_codepoint_n_length (totalOffset); > + } > + // issues: this would have (at least) O(n) lookup time, not O(1) like the > + // rest? > + > + // TODO: implement if still needed > + > + // error out of function as it is not implemented > + gcc_assert (1 == 0); > + return {0}; > + /* > + uint8_t input = peek_input(); > + > + if (input < 128) { > + // ascii -- 1 byte > + return input; > + } else if ((input & 0xC0) == 0x80) { > + // invalid (continuation; can't be first char) > + return 0xFFFE; > + } else if ((input & 0xE0) == 0xC0) { > + // 2 bytes > + uint8_t input2 = peek_input(1); > + if ((input2 & 0xC0) != 0x80) > + return 0xFFFE; > + > + uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); > + return output; > + } else if ((input & 0xF0) == 0xE0) { > + // 3 bytes > + uint8_t input2 = peek_input(1); > + if ((input2 & 0xC0) != 0x80) > + return 0xFFFE; > + > + uint8_t input3 = peek_input(2); > + if ((input3 & 0xC0) != 0x80) > + return 0xFFFE; > + > + uint32_t output > + = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & > + 0x3F) << 0); return output; } else if ((input & 0xF8) == 0xF0) { > + // 4 bytes > + uint8_t input2 = peek_input(1); > + if ((input2 & 0xC0) != 0x80) > + return 0xFFFE; > + > + uint8_t input3 = peek_input(2); > + if ((input3 & 0xC0) != 0x80) > + return 0xFFFE; > + > + 
uint8_t input4 = peek_input(3); > + if ((input4 & 0xC0) != 0x80) > + return 0xFFFE; > + > + uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) > + | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << > + 0); return output; } else { rust_error_at(get_current_location(), "invalid > + UTF-8 (too long)"); return 0xFFFE; > + }*/ > +} > + > +void > +Lexer::split_current_token (TokenId new_left, TokenId new_right) > +{ > + /* TODO: assert that this TokenId is a "simple token" like punctuation and not > + * like "IDENTIFIER"? */ > + Location current_loc = peek_token ()->get_locus (); > + TokenPtr new_left_tok = Token::make (new_left, current_loc); > + TokenPtr new_right_tok = Token::make (new_right, current_loc + 1); > + > + token_queue.replace_current_value (std::move (new_left_tok)); > + token_queue.insert (1, std::move (new_right_tok)); > +} > + > +void > +Lexer::start_line (int current_line, int current_column) > +{ > + if (line_map) > + line_map->start_line (current_line, current_column); > +} > + > +} // namespace Rust > diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h > new file mode 100644 > index 00000000000..d5a6c53719f > --- /dev/null > +++ b/gcc/rust/lex/rust-lex.h > @@ -0,0 +1,271 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. > + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// . 
> + > +#ifndef RUST_LEX_H > +#define RUST_LEX_H > + > +#include "rust-linemap.h" > +#include "rust-buffered-queue.h" > +#include "rust-token.h" > + > +namespace Rust { > +// Simple wrapper for FILE* that simplifies destruction. > +struct RAIIFile > +{ > +private: > + FILE *file; > + const char *filename; > + > + void close () > + { > + if (file != nullptr && file != stdin) > + fclose (file); > + } > + > +public: > + RAIIFile (const char *filename) : filename (filename) > + { > + if (strcmp (filename, "-") == 0) > + file = stdin; > + else > + file = fopen (filename, "r"); > + } > + > + /** > + * Create a RAIIFile from an existing instance of FILE* > + */ > + RAIIFile (FILE *raw, const char *filename = nullptr) > + : file (raw), filename (filename) > + {} > + > + RAIIFile (const RAIIFile &other) = delete; > + RAIIFile &operator= (const RAIIFile &other) = delete; > + > + // have to specify setting file to nullptr, otherwise unintended fclose occurs > + RAIIFile (RAIIFile &&other) : file (other.file), filename (other.filename) > + { > + other.file = nullptr; > + } > + > + RAIIFile &operator= (RAIIFile &&other) > + { > + close (); > + file = other.file; > + filename = other.filename; > + other.file = nullptr; > + > + return *this; > + } > + > + static RAIIFile create_error () { return RAIIFile (nullptr, nullptr); } > + > + ~RAIIFile () { close (); } > + > + FILE *get_raw () { return file; } > + const char *get_filename () { return filename; } > + > + bool ok () const { return file; } > +}; > + > +class Lexer > +{ > +private: > + // Request new Location for current column in line_table > + Location get_current_location (); > + > + // Skips the current input char. > + void skip_input (); > + // Advances current input char to n + 1 chars ahead of current position. > + void skip_input (int n); > + > + // Returns char n chars ahead of current position. > + int peek_input (); > + // Peeks the current char. > + int peek_input (int n); > + > + // Classifies keyword (i.e. 
gets id for keyword). > + TokenId classify_keyword (const std::string &str); > + > + // Builds a token from the input queue. > + TokenPtr build_token (); > + > + std::tuple parse_in_decimal (); > + std::pair parse_in_exponent_part (); > + std::pair parse_in_type_suffix (); > + std::tuple parse_escape (char opening_char); > + std::tuple parse_utf8_escape (char opening_char); > + int parse_partial_string_continue (); > + std::pair parse_partial_hex_escape (); > + std::pair parse_partial_unicode_escape (); > + > + int get_input_codepoint_length (); > + int test_get_input_codepoint_n_length (int n_start_offset); > + Codepoint peek_codepoint_input (); > + Codepoint test_peek_codepoint_input (int n); > + void skip_codepoint_input (); > + void skip_broken_string_input (int current_char); > + > + TokenPtr parse_byte_char (Location loc); > + TokenPtr parse_byte_string (Location loc); > + TokenPtr parse_raw_byte_string (Location loc); > + TokenPtr parse_raw_identifier (Location loc); > + TokenPtr parse_string (Location loc); > + TokenPtr maybe_parse_raw_string (Location loc); > + TokenPtr parse_raw_string (Location loc, int initial_hash_count); > + TokenPtr parse_non_decimal_int_literals (Location loc); > + TokenPtr parse_decimal_int_or_float (Location loc); > + TokenPtr parse_char_or_lifetime (Location loc); > + TokenPtr parse_identifier_or_keyword (Location loc); > + > + template > + TokenPtr parse_non_decimal_int_literal (Location loc, > + IsDigitFunc is_digit_func, > + std::string existent_str, int base); > + > +public: > + // Construct lexer with input file and filename provided > + Lexer (const char *filename, RAIIFile input, Linemap *linemap); > + > + // Lex the contents of a string instead of a file > + Lexer (const std::string &input); > + > + // dtor > + ~Lexer (); > + > + // don't allow copy semantics (for now, at least) > + Lexer (const Lexer &other) = delete; > + Lexer &operator= (const Lexer &other) = delete; > + > + // enable move semantics > + Lexer (Lexer 
&&other) = default; > + Lexer &operator= (Lexer &&other) = default; > + > + // Returns token n tokens ahead of current position. > + const_TokenPtr peek_token (int n) { return token_queue.peek (n); } > + // Peeks the current token. > + const_TokenPtr peek_token () { return peek_token (0); } > + > + // Advances current token to n + 1 tokens ahead of current position. > + void skip_token (int n) { token_queue.skip (n); } > + // Skips the current token. > + void skip_token () { skip_token (0); } > + > + // Replaces the current token with a specified token. > + void replace_current_token (TokenPtr replacement); > + // FIXME: don't use anymore > + > + /* Splits the current token into two. Intended for use with nested generics > + * closes (i.e. T> where >> is wrongly lexed as one token). Note that > + * this will only work with "simple" tokens like punctuation. */ > + void split_current_token (TokenId new_left, TokenId new_right); > + > + Linemap *get_line_map () { return line_map; } > + std::string get_filename () { return std::string (input.get_filename ()); } > + > +private: > + void start_line (int current_line, int current_column); > + > + // File for use as input. > + RAIIFile input; > + // TODO is this actually required? could just have file storage in InputSource > + > + // Current line number. > + int current_line; > + // Current column number. > + int current_column; > + // Current character. > + int current_char; > + // Line map. > + Linemap *line_map; > + > + /* Max column number that can be quickly allocated - higher may require > + * allocating new linemap */ > + static const int max_column_hint = 80; > + > + // Input source wrapper thing. > + class InputSource > + { > + public: > + virtual ~InputSource () {} > + > + // Overload operator () to return next char from input stream. > + virtual int next () = 0; > + }; > + > + class FileInputSource : public InputSource > + { > + private: > + // Input source file. 
> + FILE *input; > + > + public: > + // Create new input source from file. > + FileInputSource (FILE *input) : input (input) {} > + > + int next () override { return fgetc (input); } > + }; > + > + class BufferInputSource : public InputSource > + { > + private: > + const std::string &buffer; > + size_t offs; > + > + public: > + // Create new input source from file. > + BufferInputSource (const std::string &b, size_t offset) > + : buffer (b), offs (offset) > + {} > + > + int next () override > + { > + if (offs >= buffer.size ()) > + return EOF; > + > + return buffer.at (offs++); > + } > + }; > + > + // The input source for the lexer. > + // InputSource input_source; > + // Input file queue. > + std::unique_ptr raw_input_source; > + buffered_queue input_queue; > + > + // Token source wrapper thing. > + struct TokenSource > + { > + // The lexer object that will use this TokenSource. > + Lexer *lexer; > + > + // Create a new TokenSource with given lexer. > + TokenSource (Lexer *parLexer) : lexer (parLexer) {} > + > + // Overload operator () to build token in lexer. > + TokenPtr next () { return lexer->build_token (); } > + }; > + > + // The token source for the lexer. > + // TokenSource token_source; > + // Token stream queue. > + buffered_queue, TokenSource> token_queue; > +}; > + > +} // namespace Rust > + > +#endif > diff --git a/gcc/rust/lex/rust-token.cc b/gcc/rust/lex/rust-token.cc > new file mode 100644 > index 00000000000..68313c20b1c > --- /dev/null > +++ b/gcc/rust/lex/rust-token.cc > @@ -0,0 +1,135 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. 
> + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// . > + > +#include "rust-token.h" > + > +#include "rust-diagnostics.h" // for error_at > + > +namespace Rust { > +// Hackily defined way to get token description for enum value using x-macros > +const char * > +get_token_description (TokenId id) > +{ > + switch (id) > + { > +#define RS_TOKEN(name, descr) \ > + case name: \ > + return descr; > +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y) > + RS_TOKEN_LIST > +#undef RS_TOKEN_KEYWORD > +#undef RS_TOKEN > + default: > + gcc_unreachable (); > + } > +} > + > +/* Hackily defined way to get token description as a string for enum value using > + * x-macros */ > +const char * > +token_id_to_str (TokenId id) > +{ > + switch (id) > + { > +#define RS_TOKEN(name, _) \ > + case name: \ > + return #name; > +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y) > + RS_TOKEN_LIST > +#undef RS_TOKEN_KEYWORD > +#undef RS_TOKEN > + default: > + gcc_unreachable (); > + } > +} > + > +const char * > +get_type_hint_string (PrimitiveCoreType type) > +{ > + switch (type) > + { > + case CORETYPE_BOOL: > + return "bool"; > + case CORETYPE_CHAR: > + return "char"; > + case CORETYPE_STR: > + return "str"; > + // case CORETYPE_INT: > + case CORETYPE_ISIZE: > + return "isize"; > + // case CORETYPE_UINT: > + case CORETYPE_USIZE: > + return "usize"; > + case CORETYPE_F32: > + return "f32"; > + case CORETYPE_F64: > + return "f64"; > + case CORETYPE_I8: > + return "i8"; > + case CORETYPE_I16: > + return "i16"; > + case CORETYPE_I32: > + return "i32"; > + case CORETYPE_I64: > + return "i64"; > + case CORETYPE_I128: > + return "i128"; > + case CORETYPE_U8: > + return "u8"; > 
+ case CORETYPE_U16: > + return "u16"; > + case CORETYPE_U32: > + return "u32"; > + case CORETYPE_U64: > + return "u64"; > + case CORETYPE_U128: > + return "u128"; > + case CORETYPE_PURE_DECIMAL: > + return "pure_decimal"; > + case CORETYPE_UNKNOWN: > + default: > + return "unknown"; > + } > +} > + > +const char * > +Token::get_type_hint_str () const > +{ > + return get_type_hint_string (type_hint); > +} > + > +const std::string & > +Token::get_str () const > +{ > + // FIXME: attempt to return null again > + // gcc_assert(str != NULL); > + > + // HACK: allow referencing an empty string > + static const std::string empty = ""; > + > + if (str == NULL) > + { > + rust_error_at (get_locus (), > + "attempted to get string for %<%s%>, which has no string. " > + "returning empty string instead", > + get_token_description ()); > + return empty; > + } > + return *str; > +} > +} // namespace Rust > diff --git a/gcc/rust/lex/rust-token.h b/gcc/rust/lex/rust-token.h > new file mode 100644 > index 00000000000..3fa46a2cebe > --- /dev/null > +++ b/gcc/rust/lex/rust-token.h > @@ -0,0 +1,455 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. > + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// . 

#ifndef RUST_TOKEN_H
#define RUST_TOKEN_H

#include "rust-linemap.h"
#include "rust-codepoint.h"

// order: config, system, coretypes, input
#include "config.h"
#include "system.h"
#include "coretypes.h"
#include "input.h"

namespace Rust {
// "Primitive core types" in Rust - the different int and float types, as well
// as some others.  The lexer uses these values as the type hint of a literal
// token (e.g. from a type suffix).
enum PrimitiveCoreType
{
  // No type hint.
  CORETYPE_UNKNOWN,
  // named primitives
  CORETYPE_BOOL,
  CORETYPE_CHAR,
  CORETYPE_STR,
  // okay technically int and uint are arch-dependent (pointer size)
  CORETYPE_INT,
  CORETYPE_UINT,
  // numbered number primitives
  CORETYPE_F32,
  CORETYPE_F64,
  CORETYPE_I8,
  CORETYPE_I16,
  CORETYPE_I32,
  CORETYPE_I64,
  CORETYPE_I128,
  CORETYPE_U8,
  CORETYPE_U16,
  CORETYPE_U32,
  CORETYPE_U64,
  CORETYPE_U128,
  // Pure decimals are used for tuple index.
  // Also means there is no type hint.
  CORETYPE_PURE_DECIMAL,
  // arch-dependent pointer sizes; these are aliases of CORETYPE_INT and
  // CORETYPE_UINT, not additional values, so a switch cannot have a case
  // for both names of a pair
  CORETYPE_ISIZE = CORETYPE_INT,
  CORETYPE_USIZE = CORETYPE_UINT
};

// RS_TOKEN(name, description)
// RS_TOKEN_KEYWORD(name, identifier)
//
// Keep RS_TOKEN_KEYWORD sorted
//
// RS_TOKEN_LIST is an X-macro: a user defines RS_TOKEN and RS_TOKEN_KEYWORD,
// expands the list, and undefines them again.  The order of the entries is
// significant because the TokenId enumerators are generated from it.
//
// NOTE(review): FIRST_TOKEN and LAST_TOKEN have empty descriptions in this
// copy.  Angle-bracketed text was lost elsewhere in the text this header was
// recovered from, so confirm against the repository that they really are
// empty.

/* note that abstract, async, become, box, do, final, macro, override, priv,
 * try, typeof, unsized, virtual, and yield are unused */
#define RS_TOKEN_LIST \
  RS_TOKEN (FIRST_TOKEN, "") \
  RS_TOKEN (END_OF_FILE, "end of file") \
  RS_TOKEN (EXCLAM, "!") \
  RS_TOKEN (NOT_EQUAL, "!=") \
  RS_TOKEN (PERCENT, "%") \
  RS_TOKEN (PERCENT_EQ, "%=") \
  RS_TOKEN (AMP, "&") \
  RS_TOKEN (AMP_EQ, "&=") \
  RS_TOKEN (LOGICAL_AND, "&&") \
  RS_TOKEN (ASTERISK, "*") \
  RS_TOKEN (ASTERISK_EQ, "*=") \
  RS_TOKEN (PLUS, "+") \
  RS_TOKEN (PLUS_EQ, "+=") \
  RS_TOKEN (COMMA, ",") \
  RS_TOKEN (MINUS, "-") \
  RS_TOKEN (MINUS_EQ, "-=") \
  RS_TOKEN (RETURN_TYPE, "->") \
  RS_TOKEN (DOT, ".") \
  RS_TOKEN (DOT_DOT, "..") \
  RS_TOKEN (DOT_DOT_EQ, "..=") \
  RS_TOKEN (ELLIPSIS, "...") \
  RS_TOKEN (DIV, "/") \
  RS_TOKEN (DIV_EQ, "/=") \
  RS_TOKEN (COLON, ":") \
  RS_TOKEN (SEMICOLON, ";") \
  RS_TOKEN (LEFT_SHIFT, "<<") \
  RS_TOKEN (LEFT_SHIFT_EQ, "<<=") \
  RS_TOKEN (LEFT_ANGLE, "<") \
  RS_TOKEN (LESS_OR_EQUAL, "<=") \
  RS_TOKEN (EQUAL, "=") \
  RS_TOKEN (EQUAL_EQUAL, "==") \
  RS_TOKEN (MATCH_ARROW, "=>") \
  RS_TOKEN (RIGHT_ANGLE, ">") \
  RS_TOKEN (GREATER_OR_EQUAL, ">=") \
  RS_TOKEN (RIGHT_SHIFT, ">>") \
  RS_TOKEN (RIGHT_SHIFT_EQ, ">>=") \
  RS_TOKEN (PATTERN_BIND, "@") \
  RS_TOKEN (TILDE, "~") \
  RS_TOKEN (BACKSLASH, "\\") \
  RS_TOKEN (BACKTICK, "`") \
  RS_TOKEN (CARET, "^") \
  RS_TOKEN (CARET_EQ, "^=") \
  RS_TOKEN (PIPE, "|") \
  RS_TOKEN (PIPE_EQ, "|=") \
  RS_TOKEN (OR, "||") \
  RS_TOKEN (QUESTION_MARK, "?") \
  RS_TOKEN (HASH, "#") \
  /* from here on, dodgy and may not be correct. not operators and may be \
   * symbols */ \
  /* RS_TOKEN(SPACE, " ") probably too dodgy */ \
  /* RS_TOKEN(NEWLINE, "\n")*/ \
  RS_TOKEN (SCOPE_RESOLUTION, "::") /* dodgy */ \
  RS_TOKEN (SINGLE_QUOTE, "'") /* should i differentiate from lifetime? */ \
  RS_TOKEN (DOUBLE_QUOTE, "\"") \
  RS_TOKEN (UNDERSCORE, \
            "_") /* TODO: treat as reserved word like mrustc instead? */ \
  RS_TOKEN (IDENTIFIER, "identifier") \
  RS_TOKEN (INT_LITERAL, \
            "integer literal") /* do different int and float types need \
                                  different literal types? */ \
  RS_TOKEN (FLOAT_LITERAL, "float literal") \
  RS_TOKEN (STRING_LITERAL, "string literal") \
  RS_TOKEN (CHAR_LITERAL, "character literal") \
  RS_TOKEN (BYTE_STRING_LITERAL, "byte string literal") \
  RS_TOKEN (BYTE_CHAR_LITERAL, "byte character literal") \
  RS_TOKEN (LIFETIME, "lifetime") /* TODO: improve token type */ \
  /* Have "interpolated" tokens (whatever that means)? identifier, path, \
   * type, pattern, */ \
  /* expression, statement, block, meta, item in mrustc (but not directly in \
   * lexer). */ \
  RS_TOKEN (LEFT_PAREN, "(") \
  RS_TOKEN (RIGHT_PAREN, ")") \
  RS_TOKEN (LEFT_CURLY, "{") \
  RS_TOKEN (RIGHT_CURLY, "}") \
  RS_TOKEN (LEFT_SQUARE, "[") \
  RS_TOKEN (RIGHT_SQUARE, "]") \
  /* Macros */ \
  RS_TOKEN (DOLLAR_SIGN, "$") \
  /* Doc Comments */ \
  RS_TOKEN (INNER_DOC_COMMENT, "#![doc]") \
  RS_TOKEN (OUTER_DOC_COMMENT, "#[doc]") \
  /* have "weak" union and 'static keywords? */ \
  \
  RS_TOKEN_KEYWORD (ABSTRACT, "abstract") /* unused */ \
  RS_TOKEN_KEYWORD (AS, "as") \
  RS_TOKEN_KEYWORD (ASYNC, "async") /* unused */ \
  RS_TOKEN_KEYWORD (BECOME, "become") /* unused */ \
  RS_TOKEN_KEYWORD (BOX, "box") /* unused */ \
  RS_TOKEN_KEYWORD (BREAK, "break") \
  RS_TOKEN_KEYWORD (CONST, "const") \
  RS_TOKEN_KEYWORD (CONTINUE, "continue") \
  RS_TOKEN_KEYWORD (CRATE, "crate") \
  /* FIXME: Do we need to add $crate (DOLLAR_CRATE) as a reserved kw? */ \
  RS_TOKEN_KEYWORD (DO, "do") /* unused */ \
  RS_TOKEN_KEYWORD (DYN, "dyn") \
  RS_TOKEN_KEYWORD (ELSE, "else") \
  RS_TOKEN_KEYWORD (ENUM_TOK, "enum") \
  RS_TOKEN_KEYWORD (EXTERN_TOK, "extern") \
  RS_TOKEN_KEYWORD (FALSE_LITERAL, "false") \
  RS_TOKEN_KEYWORD (FINAL_TOK, "final") /* unused */ \
  RS_TOKEN_KEYWORD (FN_TOK, "fn") \
  RS_TOKEN_KEYWORD (FOR, "for") \
  RS_TOKEN_KEYWORD (IF, "if") \
  RS_TOKEN_KEYWORD (IMPL, "impl") \
  RS_TOKEN_KEYWORD (IN, "in") \
  RS_TOKEN_KEYWORD (LET, "let") \
  RS_TOKEN_KEYWORD (LOOP, "loop") \
  RS_TOKEN_KEYWORD (MACRO, "macro") /* unused */ \
  RS_TOKEN_KEYWORD (MATCH_TOK, "match") \
  RS_TOKEN_KEYWORD (MOD, "mod") \
  RS_TOKEN_KEYWORD (MOVE, "move") \
  RS_TOKEN_KEYWORD (MUT, "mut") \
  RS_TOKEN_KEYWORD (OVERRIDE_TOK, "override") /* unused */ \
  RS_TOKEN_KEYWORD (PRIV, "priv") /* unused */ \
  RS_TOKEN_KEYWORD (PUB, "pub") \
  RS_TOKEN_KEYWORD (REF, "ref") \
  RS_TOKEN_KEYWORD (RETURN_TOK, "return") \
  RS_TOKEN_KEYWORD (SELF_ALIAS, \
                    "Self") /* mrustc does not treat this as a reserved word*/ \
  RS_TOKEN_KEYWORD (SELF, "self") \
  RS_TOKEN_KEYWORD (STATIC_TOK, "static") \
  RS_TOKEN_KEYWORD (STRUCT_TOK, "struct") \
  RS_TOKEN_KEYWORD (SUPER, "super") \
  RS_TOKEN_KEYWORD (TRAIT, "trait") \
  RS_TOKEN_KEYWORD (TRUE_LITERAL, "true") \
  RS_TOKEN_KEYWORD (TRY, "try") /* unused */ \
  RS_TOKEN_KEYWORD (TYPE, "type") \
  RS_TOKEN_KEYWORD (TYPEOF, "typeof") /* unused */ \
  RS_TOKEN_KEYWORD (UNSAFE, "unsafe") \
  RS_TOKEN_KEYWORD (UNSIZED, "unsized") /* unused */ \
  RS_TOKEN_KEYWORD (USE, "use") \
  RS_TOKEN_KEYWORD (VIRTUAL, "virtual") /* unused */ \
  RS_TOKEN_KEYWORD (WHERE, "where") \
  RS_TOKEN_KEYWORD (WHILE, "while") \
  RS_TOKEN_KEYWORD (YIELD, "yield") /* unused */ \
  \
  RS_TOKEN (LAST_TOKEN, "")

// Contains all token types. Crappy implementation via x-macros.
> +enum TokenId > +{ > +#define RS_TOKEN(name, _) name, > +#define RS_TOKEN_KEYWORD(x, y) RS_TOKEN (x, y) > + RS_TOKEN_LIST > +#undef RS_TOKEN_KEYWORD > +#undef RS_TOKEN > +}; > + > +// dodgy "TokenPtr" declaration with Token forward declaration > +class Token; > +// A smart pointer (shared_ptr) to Token. > +typedef std::shared_ptr TokenPtr; > +// A smart pointer (shared_ptr) to a constant Token. > +typedef std::shared_ptr const_TokenPtr; > + > +// Hackily defined way to get token description for enum value using x-macros > +const char * > +get_token_description (TokenId id); > +/* Hackily defined way to get token description as a string for enum value using > + * x-macros */ > +const char * > +token_id_to_str (TokenId id); > +// Get type hint description as a string. > +const char * > +get_type_hint_string (PrimitiveCoreType type); > + > +// Represents a single token. Create using factory static methods. > +class Token > +{ > +private: > + // Token kind. > + TokenId token_id; > + // Token location. > + Location locus; > + // Associated text (if any) of token. > + std::unique_ptr str; > + // TODO: maybe remove issues and just store std::string as value? > + /* Type hint for token based on lexer data (e.g. type suffix). Does not exist > + * for most tokens. */ > + PrimitiveCoreType type_hint; > + > + // Token constructor from token id and location. Has a null string. > + Token (TokenId token_id, Location location) > + : token_id (token_id), locus (location), str (nullptr), > + type_hint (CORETYPE_UNKNOWN) > + {} > + > + // Token constructor from token id, location, and a string. > + Token (TokenId token_id, Location location, std::string &¶mStr) > + : token_id (token_id), locus (location), > + str (new std::string (std::move (paramStr))), type_hint (CORETYPE_UNKNOWN) > + {} > + > + // Token constructor from token id, location, and a char. 
> + Token (TokenId token_id, Location location, char paramChar) > + : token_id (token_id), locus (location), > + str (new std::string (1, paramChar)), type_hint (CORETYPE_UNKNOWN) > + {} > + > + // Token constructor from token id, location, and a "codepoint". > + Token (TokenId token_id, Location location, Codepoint paramCodepoint) > + : token_id (token_id), locus (location), > + str (new std::string (paramCodepoint.as_string ())), > + type_hint (CORETYPE_UNKNOWN) > + {} > + > + // Token constructor from token id, location, a string, and type hint. > + Token (TokenId token_id, Location location, std::string &¶mStr, > + PrimitiveCoreType parType) > + : token_id (token_id), locus (location), > + str (new std::string (std::move (paramStr))), type_hint (parType) > + {} > + > +public: > + // No default constructor. > + Token () = delete; > + // Do not copy/assign tokens. > + Token (const Token &) = delete; > + Token &operator= (const Token &) = delete; > + > + // Allow moving tokens. > + Token (Token &&other) = default; > + Token &operator= (Token &&other) = default; > + > + ~Token () = default; > + > + /* TODO: make_shared (which saves a heap allocation) does not work with the > + * private constructor */ > + > + // Makes and returns a new TokenPtr (with null string). > + static TokenPtr make (TokenId token_id, Location locus) > + { > + // return std::make_shared (token_id, locus); > + return TokenPtr (new Token (token_id, locus)); > + } > + > + // Makes and returns a new TokenPtr of type IDENTIFIER. > + static TokenPtr make_identifier (Location locus, std::string &&str) > + { > + // return std::make_shared (IDENTIFIER, locus, str); > + return TokenPtr (new Token (IDENTIFIER, locus, std::move (str))); > + } > + > + // Makes and returns a new TokenPtr of type INT_LITERAL. 
> + static TokenPtr make_int (Location locus, std::string &&str, > + PrimitiveCoreType type_hint = CORETYPE_UNKNOWN) > + { > + // return std::make_shared<Token> (INT_LITERAL, locus, str, type_hint); > + return TokenPtr ( > + new Token (INT_LITERAL, locus, std::move (str), type_hint)); > + } > + > + // Makes and returns a new TokenPtr of type FLOAT_LITERAL. > + static TokenPtr make_float (Location locus, std::string &&str, > + PrimitiveCoreType type_hint = CORETYPE_UNKNOWN) > + { > + // return std::make_shared<Token> (FLOAT_LITERAL, locus, str, type_hint); > + return TokenPtr ( > + new Token (FLOAT_LITERAL, locus, std::move (str), type_hint)); > + } > + > + // Makes and returns a new TokenPtr of type STRING_LITERAL. > + static TokenPtr make_string (Location locus, std::string &&str) > + { > + // return std::make_shared<Token> (STRING_LITERAL, locus, str, > + // CORETYPE_STR); > + return TokenPtr ( > + new Token (STRING_LITERAL, locus, std::move (str), CORETYPE_STR)); > + } > + > + // Makes and returns a new TokenPtr of type CHAR_LITERAL. > + static TokenPtr make_char (Location locus, Codepoint char_lit) > + { > + // return std::make_shared<Token> (CHAR_LITERAL, locus, char_lit); > + return TokenPtr (new Token (CHAR_LITERAL, locus, char_lit)); > + } > + > + // Makes and returns a new TokenPtr of type BYTE_CHAR_LITERAL. > + static TokenPtr make_byte_char (Location locus, char byte_char) > + { > + // return std::make_shared<Token> (BYTE_CHAR_LITERAL, locus, byte_char); > + return TokenPtr (new Token (BYTE_CHAR_LITERAL, locus, byte_char)); > + } > + > + // Makes and returns a new TokenPtr of type BYTE_STRING_LITERAL (fix). > + static TokenPtr make_byte_string (Location locus, std::string &&str) > + { > + // return std::make_shared<Token> (BYTE_STRING_LITERAL, locus, str); > + return TokenPtr (new Token (BYTE_STRING_LITERAL, locus, std::move (str))); > + } > + > + // Makes and returns a new TokenPtr of type INNER_DOC_COMMENT. 
> + static TokenPtr make_inner_doc_comment (Location locus, std::string &&str) > + { > + return TokenPtr (new Token (INNER_DOC_COMMENT, locus, std::move (str))); > + } > + > + // Makes and returns a new TokenPtr of type OUTER_DOC_COMMENT. > + static TokenPtr make_outer_doc_comment (Location locus, std::string &&str) > + { > + return TokenPtr (new Token (OUTER_DOC_COMMENT, locus, std::move (str))); > + } > + > + // Makes and returns a new TokenPtr of type LIFETIME. > + static TokenPtr make_lifetime (Location locus, std::string &&str) > + { > + // return std::make_shared<Token> (LIFETIME, locus, str); > + return TokenPtr (new Token (LIFETIME, locus, std::move (str))); > + } > + > + // Gets id of the token. > + TokenId get_id () const { return token_id; } > + > + // Gets location of the token. > + Location get_locus () const { return locus; } > + > + // Gets string description of the token. > + const std::string & > + get_str () const; /*{ > +// FIXME: put in header again when fix null problem > +//gcc_assert(str != nullptr); > +if (str == nullptr) { > +error_at(get_locus(), "attempted to get string for '%s', which has no string. > +returning empty string instead.", get_token_description()); return ""; > +} > +return *str; > +}*/ > + > + // Gets token's type hint info. > + PrimitiveCoreType get_type_hint () const > + { > + return type_hint == CORETYPE_PURE_DECIMAL ? CORETYPE_UNKNOWN : type_hint; > + } > + > + // diagnostics (error reporting) > + const char *get_token_description () const > + { > + return Rust::get_token_description (token_id); > + } > + > + // debugging > + const char *token_id_to_str () const > + { > + return Rust::token_id_to_str (token_id); > + } > + > + // debugging > + const char *get_type_hint_str () const; > + > + /* Returns whether the token is a literal of any type (int, float, char, > + * string, byte char, byte string). 
*/ > + bool is_literal () const > + { > + switch (token_id) > + { > + case INT_LITERAL: > + case FLOAT_LITERAL: > + case CHAR_LITERAL: > + case STRING_LITERAL: > + case BYTE_CHAR_LITERAL: > + case BYTE_STRING_LITERAL: > + return true; > + default: > + return false; > + } > + } > + > + /* Returns whether the token actually has a string (regardless of whether it > + * should or not). */ > + bool has_str () const { return str != nullptr; } > + > + // Returns whether the token should have a string. > + bool should_have_str () const > + { > + return is_literal () || token_id == IDENTIFIER || token_id == LIFETIME; > + } > + > + // Returns whether the token is a pure decimal int literal > + bool is_pure_decimal () const { return type_hint == CORETYPE_PURE_DECIMAL; } > +}; > +} // namespace Rust > + > +#endif > diff --git a/gcc/rust/rust-buffered-queue.h b/gcc/rust/rust-buffered-queue.h > new file mode 100644 > index 00000000000..afcc4670cac > --- /dev/null > +++ b/gcc/rust/rust-buffered-queue.h > @@ -0,0 +1,204 @@ > +// Copyright (C) 2020-2022 Free Software Foundation, Inc. > + > +// This file is part of GCC. > + > +// GCC is free software; you can redistribute it and/or modify it under > +// the terms of the GNU General Public License as published by the Free > +// Software Foundation; either version 3, or (at your option) any later > +// version. > + > +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY > +// WARRANTY; without even the implied warranty of MERCHANTABILITY or > +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License > +// for more details. > + > +// You should have received a copy of the GNU General Public License > +// along with GCC; see the file COPYING3. If not see > +// <http://www.gnu.org/licenses/>. > + > +#ifndef RUST_BUFFERED_QUEUE_H > +#define RUST_BUFFERED_QUEUE_H > + > +#include "rust-system.h" > + > +namespace Rust { > +/* Buffered queue implementation. Items are of type T, queue source is of type > + * Source. 
Note that this is owning of the source. */ > +template <typename T, typename Source> class buffered_queue > +{ > +public: > + // Construct empty queue from Source src. > + buffered_queue (Source src) : source (src), start (0), end (0), buffer () {} > + > + /* disable copying (since source is probably non-copyable) > + * TODO is this actually a good idea? If source is non-copyable, it would > + * just delete the copy constructor anyway.*/ > + buffered_queue (const buffered_queue &other) = delete; > + buffered_queue &operator= (const buffered_queue &other) = delete; > + > + // enable moving > + buffered_queue (buffered_queue &&other) = default; > + buffered_queue &operator= (buffered_queue &&other) = default; > + > + // Returns token at position start + n (i.e. n tokens ahead). > + T peek (int n) > + { > + // n should not be behind > + rust_assert (n >= 0); > + > + int num_queued_items = end - start; > + int num_items_required = n + 1; > + > + // if required items go past end of queue, add them to queue > + if (num_items_required > num_queued_items) > + { > + int num_items_to_read = num_items_required - num_queued_items; > + > + /* if queue length + extra items is larger than buffer size, expand > + * buffer */ > + if (end + num_items_to_read > (int) buffer.size ()) > + { > + // Resize the buffer by 1.5x > + int new_size = (buffer.size () + num_items_to_read); > + new_size += (new_size >> 1); > + > + // old method: > + /* > + // create new queue buffer with new size > + std::vector<T> new_queue (new_size); > + std::copy (buffer.begin () + start, buffer.begin () + end, > + new_queue.begin ()); > + start = 0; > + end = num_queued_items; > + // TODO: would move be better here? optimisation for move with > + // shared pointer? > + > + // swap member buffer and new queue buffer > + std::swap (buffer, new_queue); > + */ > + > + // TODO: determine overhead of this approach vs copy. Should be > + // lower. 
> + std::vector<T> new_queue; > + new_queue.reserve (new_size); > + new_queue.insert (new_queue.begin (), > + std::make_move_iterator (buffer.begin () + start), > + std::make_move_iterator (buffer.begin () + end)); > + start = 0; > + end = num_queued_items; > + // fill up rest of vector with junk so that indexing can work > + new_queue.insert (new_queue.begin () + end, > + new_size - new_queue.size (), T ()); > + > + buffer = std::move (new_queue); > + /* this should be best method - std::move(range) would have > + * allocation problems; initial construction would require > + * reallocation upon resizing */ > + > + // validate that buffer is large enough now > + rust_assert (end + num_items_to_read <= (int) buffer.size ()); > + } > + > + /* iterate through buffer and invoke operator () on source on values > + * past original end */ > + for (int i = 0; i < num_items_to_read; i++) > + buffer[end + i] = source.next (); > + > + // move end based on additional items added > + end += num_items_to_read; > + } > + > + rust_assert (0 <= start); > + rust_assert (start <= end); > + rust_assert (end <= (int) buffer.size ()); > + > + rust_assert (start + n < end); > + > + // return value at start + n in buffer > + return buffer[start + n]; > + } > + > + /* TODO: add faster peek current token to remove overhead of conditional > + * branches? */ > + > + // Advances start by n + 1. > + void skip (int n) > + { > + // Call peek to ensure requested n is actually in queue. > + peek (n); > + > + // Clear queue values from start to n (inclusive). > + for (int i = 0; i < (n + 1); i++) > + buffer[start + i] = T (); > + > + // Move start forward by n + 1. > + start += (n + 1); > + > + // Ensure start is not impossible somehow > + rust_assert (0 <= start); > + rust_assert (start <= end); > + > + // Compact buffer if empty > + if (start == end) > + start = end = 0; > + } > + > + /* Inserts element at front of vector. 
Really dirty hack with terrible > + * performance, only use when really needed. */ > + void insert_at_front (T elem_to_insert) > + { > + // TODO: test as this may not work properly > + > + // Insert actual element in buffer at start. > + buffer.insert (buffer.begin (), elem_to_insert); > + > + /* Increase the end number since added element means all others have shifted > + * one along */ > + end++; > + } > + > + // Insert at arbitrary position (attempt) > + void insert (int index, T elem_to_insert) > + { > + // TODO: test as this may not work properly > + > + // n should not be behind > + rust_assert (index >= 0); > + > + // call peek to ensure that the items behind this (at least) are in queue > + if (index >= 1) > + peek (index - 1); > + else > + peek (index); > + > + buffer.insert (buffer.begin () + start + index, std::move (elem_to_insert)); > + > + end++; > + } > + > + // Replaces the current value in the buffer. Total HACK. > + void replace_current_value (T replacement) > + { > + // call peek to ensure value exists > + peek (0); > + > + buffer[start] = std::move (replacement); > + > + // don't move start or end > + } > + > +private: > + // Source of tokens for queue. > + Source source; > + > + // Begin of range in buffer, inclusive. > + int start; > + // End of range in buffer, exclusive. > + int end; > + > + // Queue buffer. > + std::vector<T> buffer; > +}; > +} // namespace Rust > + > +#endif > -- > 2.25.1 >