From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from outbound.soverin.net (outbound.soverin.net [IPv6:2a01:4f8:fff0:2d:8::215]) by sourceware.org (Postfix) with ESMTPS id 446023858D3C for ; Sat, 2 Oct 2021 22:15:11 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 446023858D3C Authentication-Results: sourceware.org; dmarc=none (p=none dis=none) header.from=klomp.org Authentication-Results: sourceware.org; spf=pass smtp.mailfrom=klomp.org Received: from smtp.soverin.net (unknown [10.10.3.28]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits)) (No client certificate requested) by outbound.soverin.net (Postfix) with ESMTPS id D1FB260123 for ; Sat, 2 Oct 2021 22:15:08 +0000 (UTC) Received: from smtp.soverin.net (smtp.soverin.net [159.69.232.142]) by soverin.net Received: by reform (Postfix, from userid 1000) id EFD062E83151; Sun, 3 Oct 2021 00:15:04 +0200 (CEST) From: Mark Wielaard To: gcc-rust@gcc.gnu.org Cc: Mark Wielaard Subject: [PATCH] Fix lexer to not produce bad unicode escape values Date: Sun, 3 Oct 2021 00:13:46 +0200 Message-Id: <20211002221346.35708-1-mark@klomp.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Spam-Status: No, score=-11.4 required=5.0 tests=BAYES_00, GIT_PATCH_0, KAM_DMARC_STATUS, KAM_SHORT, RCVD_IN_DNSWL_LOW, SPF_HELO_PASS, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: gcc-rust@gcc.gnu.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: gcc-rust mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Sat, 02 Oct 2021 22:15:13 -0000 There were a couple of issues in the lexer unicode escape code. Unicode escape sequences must always start with an opening curly bracket (and end with a closing one). Underscores are not allowed as starting character. 
And the produced values must be unicode scalar values, which excludes surrogate values (D800 to DFFF) or values larger than 10FFFF. Also try to recover more gracefully from errors by trying to skip past any bad characters to the end of the escape sequence. Test all of the above in a new testcase unicode_escape.rs. --- https://git.sr.ht/~mjw/gccrs/commit/unicode_escape gcc/rust/lex/rust-lex.cc | 88 ++++++++++++++++---- gcc/testsuite/rust/compile/unicode_escape.rs | 60 +++++++++++++ 2 files changed, 132 insertions(+), 16 deletions(-) create mode 100644 gcc/testsuite/rust/compile/unicode_escape.rs diff --git a/gcc/rust/lex/rust-lex.cc b/gcc/rust/lex/rust-lex.cc index bbddea04d0c..2b3c89b48be 100644 --- a/gcc/rust/lex/rust-lex.cc +++ b/gcc/rust/lex/rust-lex.cc @@ -1273,6 +1273,8 @@ Lexer::parse_escape (char opening_char) rust_error_at (get_current_location (), "cannot have a unicode escape \\u in a byte %s", opening_char == '\'' ? "character" : "string"); + // Try to parse it anyway, just to skip it + parse_partial_unicode_escape (); return std::make_tuple (output_char, additional_length_offset, false); case '\r': case '\n': @@ -1461,16 +1463,34 @@ Lexer::parse_partial_unicode_escape () { skip_input (); current_char = peek_input (); - int additional_length_offset = 1; + int additional_length_offset = 0; - bool need_close_brace = false; - if (current_char == '{') + if (current_char != '{') { - need_close_brace = true; + rust_error_at (get_current_location (), + "unicode escape should start with %<{%>"); + /* Skip what should probably have been between brackets. 
*/ + while (is_x_digit (current_char) || current_char == '_') + { + skip_input (); + current_char = peek_input (); + additional_length_offset++; + } + return std::make_pair (Codepoint (0), additional_length_offset); + } + skip_input (); + current_char = peek_input (); + additional_length_offset++; + + if (current_char == '_') + { + rust_error_at (get_current_location (), + "unicode escape cannot start with %<_%>"); skip_input (); current_char = peek_input (); additional_length_offset++; + // fallthrough and try to parse the rest anyway } // parse unicode escape - 1-6 hex digits @@ -1500,21 +1520,45 @@ Lexer::parse_partial_unicode_escape () current_char = peek_input (); } - // ensure closing brace if required - if (need_close_brace) + if (current_char == '}') { - if (current_char == '}') + skip_input (); + current_char = peek_input (); + additional_length_offset++; + } + else + { + // actually an error, but allow propagation anyway. Assume that + // wrong bracket, whitespace or single/double quotes are wrong + // termination, otherwise it is a wrong character, then skip to the actual + // terminator. 
+ if (current_char == '{' || is_whitespace (current_char) + || current_char == '\'' || current_char == '"') { - skip_input (); - current_char = peek_input (); - additional_length_offset++; + rust_error_at (get_current_location (), + "expected terminating %<}%> in unicode escape"); + return std::make_pair (Codepoint (0), additional_length_offset); } else { - // actually an error, but allow propagation anyway rust_error_at (get_current_location (), - "expected terminating %<}%> in unicode escape"); - // return false; + "invalid character %<%c%> in unicode escape", + current_char); + while (current_char != '}' && current_char != '{' + && !is_whitespace (current_char) && current_char != '\'' + && current_char != '"') + { + skip_input (); + current_char = peek_input (); + additional_length_offset++; + } + // Consume the actual closing bracket if found + if (current_char == '}') + { + skip_input (); + current_char = peek_input (); + additional_length_offset++; + } return std::make_pair (Codepoint (0), additional_length_offset); } } @@ -1530,10 +1574,22 @@ Lexer::parse_partial_unicode_escape () return std::make_pair (Codepoint (0), additional_length_offset); } - long hex_num = std::strtol (num_str.c_str (), nullptr, 16); + unsigned long hex_num = std::strtoul (num_str.c_str (), nullptr, 16); - // assert fits a uint32_t - gcc_assert (hex_num < 4294967296); + if (hex_num > 0xd7ff && hex_num < 0xe000) + { + rust_error_at ( + get_current_location (), + "unicode escape cannot be a surrogate value (D800 to DFFF)"); + return std::make_pair (Codepoint (0), additional_length_offset); + } + + if (hex_num > 0x10ffff) + { + rust_error_at (get_current_location (), + "unicode escape cannot be larger than 10FFFF"); + return std::make_pair (Codepoint (0), additional_length_offset); + } // return true; return std::make_pair (Codepoint (static_cast (hex_num)), diff --git a/gcc/testsuite/rust/compile/unicode_escape.rs b/gcc/testsuite/rust/compile/unicode_escape.rs new file mode 100644 index 
00000000000..39b91d8a95c --- /dev/null +++ b/gcc/testsuite/rust/compile/unicode_escape.rs @@ -0,0 +1,60 @@ +fn main () +{ + // Braces are required + let _cbl = '\u013'; // { dg-error "unicode escape" } + let _sbl = "\u013"; //{ dg-error "unicode escape" } + + // One to six hex digits + let _c0 = '\u{}'; // { dg-error "unicode escape" } + let _c1 = '\u{0}'; + let _c2 = '\u{00}'; + let _c3 = '\u{000}'; + let _c4 = '\u{0000}'; + let _c5 = '\u{00000}'; + let _c6 = '\u{000000}'; + let _c7 = '\u{0000000}'; // { dg-error "unicode escape" } + + let _s0 = "\u{}"; // { dg-error "unicode escape" } + let _s1 = "\u{0}"; + let _s2 = "\u{00}"; + let _s3 = "\u{000}"; + let _s4 = "\u{0000}"; + let _s5 = "\u{00000}"; + let _s6 = "\u{000000}"; + let _s7 = "\u{0000000}"; // { dg-error "unicode escape" } + + // Underscores OK except for start + let _c_ = '\u{00___01__0_1_}'; + let _s_ = "\u{00___01__0_1_}"; + let _c__ = '\u{_00__01__0_}'; // { dg-error "unicode escape" } + let _s__ = "\u{_00__01__0_}"; // { dg-error "unicode escape" } + + // Must be hex chars + let _chex = '\u{hex}'; // { dg-error "unicode escape" } + let _shex = '\u{hex}'; // { dg-error "unicode escape" } + + // Only valid from 0x0 to 0xD7FF and from 0xE000 to 0x10FFFF + let _cd7ff = '\u{D7FF}'; + let _sd7ff = "\u{D7FF}"; + let _cd800 = '\u{D800}'; // { dg-error "unicode escape" } + let _sd800 = "\u{D800}"; // { dg-error "unicode escape" } + + let _cdfff = '\u{DFFF}'; // { dg-error "unicode escape" } + let _sdfff = "\u{DFFF}"; // { dg-error "unicode escape" } + let _ce000 = '\u{E000}'; + let _se000 = "\u{E000}"; + + let _clast = '\u{10FFFF}'; + let _slast = "\u{10FFFF}"; + let _clast1 = '\u{110000}'; // { dg-error "unicode escape" } + let _slast1 = "\u{110000}"; // { dg-error "unicode escape" } + + let _cffffff = '\u{FFFFFF}'; // { dg-error "unicode escape" } + let _sffffff = "\u{FFFFFF}"; // { dg-error "unicode escape" } + + // unicode escapes cannot be used in bytes or byte strings. 
+ // Except in raw byte strings (where they aren't escapes). + let _bc = b'\u{000A}'; // { dg-error "unicode escape" } + let _bs = b"\u{000A}"; // { dg-error "unicode escape" } + let _rbs = br"\u{000A}"; +} -- 2.32.0