From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7905) id 3CA2E3857C4C; Tue, 16 Jan 2024 18:06:44 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 3CA2E3857C4C DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gcc.gnu.org; s=default; t=1705428404; bh=DhQfDOSUDc1l9Or6xJQ3roGezaIihkmLI9WPj9J9RLc=; h=From:To:Subject:Date:From; b=MbwuS0UM/4voLyZgAXhzRZ5fwsm+MpLJHPnKGgdS17stYm4CMzoA8Tcl8hdtzo/re GJoWdCpohhq7USp8OD+Jt/b/PG/CUnPpNZ4tDOWAdctSY0GFJgkq3ibJal+hlQcIw6 2J3digRQIA7YnBOUozrIkXcetGWIzlWUQBujGa3Y= MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Content-Type: text/plain; charset="utf-8" From: Arthur Cohen To: gcc-cvs@gcc.gnu.org Subject: [gcc r14-7912] gccrs: clean up Codepoint and InputSource X-Act-Checkin: gcc X-Git-Author: Raiki Tamura X-Git-Refname: refs/heads/trunk X-Git-Oldrev: 1e288d66cb4f0a25a249c0c6dacc2efbf9e44dc8 X-Git-Newrev: 5b47923fe512f088a4f1c31466236843c20b7ff9 Message-Id: <20240116180644.3CA2E3857C4C@sourceware.org> Date: Tue, 16 Jan 2024 18:06:44 +0000 (GMT) List-Id: https://gcc.gnu.org/g:5b47923fe512f088a4f1c31466236843c20b7ff9 commit r14-7912-g5b47923fe512f088a4f1c31466236843c20b7ff9 Author: Raiki Tamura Date: Sun Aug 6 19:17:17 2023 +0900 gccrs: clean up Codepoint and InputSource gcc/rust/ChangeLog: * lex/rust-codepoint.h: Moved to... * util/rust-codepoint.h: ...here. 
* lex/rust-input-source.h: Add missing license * util/rust-unicode.cc: Add missing license * util/rust-punycode.cc (extract_basic_string): Remove constant Signed-off-by: Raiki Tamura Diff: --- gcc/rust/lex/rust-input-source.h | 70 ++++++++++++++++++++++----------- gcc/rust/{lex => util}/rust-codepoint.h | 0 gcc/rust/util/rust-punycode.cc | 4 +- gcc/rust/util/rust-unicode.cc | 18 +++++++++ 4 files changed, 66 insertions(+), 26 deletions(-) diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h index 07137debb8f..32261a05cae 100644 --- a/gcc/rust/lex/rust-input-source.h +++ b/gcc/rust/lex/rust-input-source.h @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #ifndef RUST_INPUT_SOURCE_H #define RUST_INPUT_SOURCE_H @@ -5,6 +23,14 @@ #include "optional.h" namespace Rust { + +constexpr uint8_t UTF8_BOM1 = 0xEF; +constexpr uint8_t UTF8_BOM2 = 0xBB; +constexpr uint8_t UTF8_BOM3 = 0xBF; + +constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; +constexpr uint32_t CODEPOINT_INVALID = 0xFFFE; + // Input source wrapper thing. 
class InputSource { @@ -23,7 +49,7 @@ private: if ((int32_t) input == EOF) return Codepoint::eof (); - else if (input < 128) + else if (input <= MAX_ASCII_CODEPOINT) { // ascii -- 1 byte return {input}; @@ -31,14 +57,14 @@ private: else if ((input & 0xC0) == 0x80) { // invalid (continuation; can't be first char) - return {0xFFFE}; + return {CODEPOINT_INVALID}; } else if ((input & 0xE0) == 0xC0) { // 2 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0); return output; @@ -50,23 +76,23 @@ private: // If the second byte is equal to 0xBB then the input is no longer a // valid UTF-8 char. Then, we check if the third byte makes up a UTF // BOM. - if (input == 0xEF && input2 == 0xBB) + if (input == UTF8_BOM1 && input2 == UTF8_BOM2) { uint8_t input3 = next_byte (); - if (input3 == 0xBF) + if (input3 == UTF8_BOM3) // found BOM return next_codepoint (); else - return {0xFFFE}; + return {CODEPOINT_INVALID}; } if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6) | ((input3 & 0x3F) << 0); @@ -77,15 +103,15 @@ private: // 4 bytes uint8_t input2 = next_byte (); if ((input2 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input3 = next_byte (); if ((input3 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint8_t input4 = next_byte (); if ((input4 & 0xC0) != 0x80) - return {0xFFFE}; + return {CODEPOINT_INVALID}; uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12) | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0); @@ -93,23 +119,26 @@ private: } else { - return {0xFFFE}; + return {CODEPOINT_INVALID}; } } protected: - // Check if the input source is valid as utf-8 and copy all characters to - // `chars`. 
+ // This method must be called by the constructor to initialize the input + // source. We cannot move this to the constructor because it calls a + // virtual method . void init () { + // Check if the input source is valid as utf-8 and copy all characters to + // `chars`. Codepoint char32 = next_codepoint (); - while (!char32.is_eof () && char32 != 0xFFFE) + while (!char32.is_eof () && char32 != CODEPOINT_INVALID) { chars.push_back (char32); char32 = next_codepoint (); } - if (char32 == 0xFFFE) + if (char32 == CODEPOINT_INVALID) { // Input source is not valid as utf-8. is_valid_utf8 = false; @@ -158,11 +187,7 @@ private: public: // Create new input source from file. - FileInputSource (FILE *input) : InputSource (), input (input) - { - // TODO make this better? - init (); - } + FileInputSource (FILE *input) : InputSource (), input (input) { init (); } }; class BufferInputSource : public InputSource @@ -175,7 +200,7 @@ private: { if (offs >= buffer.size ()) return EOF; - return (uint8_t) buffer.at (offs++); + return static_cast<uint8_t> (buffer.at (offs++)); } public: @@ -183,7 +208,6 @@ public: BufferInputSource (const std::string &b, size_t offset) : InputSource (), buffer (b), offs (offset) { - // TODO make this better? 
init (); } }; diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h similarity index 100% rename from gcc/rust/lex/rust-codepoint.h rename to gcc/rust/util/rust-codepoint.h diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc index a35d54aa6f5..6c796ab794f 100644 --- a/gcc/rust/util/rust-punycode.cc +++ b/gcc/rust/util/rust-punycode.cc @@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72; constexpr uint32_t INITIAL_N = 128; constexpr char DELIMITER = '-'; -constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F; - std::string extract_basic_string (const std::vector<Codepoint> &src) { std::string basic_string; for (auto c : src) { - if (c.value <= MAX_ASCII_CODEPOINT) + if (c.value <= 0x7F) basic_string += c.as_string (); } return basic_string; diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc index b2ddaf0b9ce..95653cb760d 100644 --- a/gcc/rust/util/rust-unicode.cc +++ b/gcc/rust/util/rust-unicode.cc @@ -1,3 +1,21 @@ +// Copyright (C) 2020-2023 Free Software Foundation, Inc. + +// This file is part of GCC. + +// GCC is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License as published by the Free +// Software Foundation; either version 3, or (at your option) any later +// version. + +// GCC is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// for more details. + +// You should have received a copy of the GNU General Public License +// along with GCC; see the file COPYING3. If not see +// <http://www.gnu.org/licenses/>. + #include "rust-system.h" #include "optional.h" #include "selftest.h"