public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-7912] gccrs: clean up Codepoint and InputSource
@ 2024-01-16 18:06 Arthur Cohen
  0 siblings, 0 replies; only message in thread
From: Arthur Cohen @ 2024-01-16 18:06 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:5b47923fe512f088a4f1c31466236843c20b7ff9

commit r14-7912-g5b47923fe512f088a4f1c31466236843c20b7ff9
Author: Raiki Tamura <tamaron1203@gmail.com>
Date:   Sun Aug 6 19:17:17 2023 +0900

    gccrs: clean up Codepoint and InputSource
    
    gcc/rust/ChangeLog:
    
            * lex/rust-codepoint.h: Moved to...
            * util/rust-codepoint.h: ...here.
            * lex/rust-input-source.h: Add missing license
            * util/rust-unicode.cc: Add missing license
            * util/rust-punycode.cc (extract_basic_string): Remove constant
    
    Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>

Diff:
---
 gcc/rust/lex/rust-input-source.h        | 70 ++++++++++++++++++++++-----------
 gcc/rust/{lex => util}/rust-codepoint.h |  0
 gcc/rust/util/rust-punycode.cc          |  4 +-
 gcc/rust/util/rust-unicode.cc           | 18 +++++++++
 4 files changed, 66 insertions(+), 26 deletions(-)

diff --git a/gcc/rust/lex/rust-input-source.h b/gcc/rust/lex/rust-input-source.h
index 07137debb8f..32261a05cae 100644
--- a/gcc/rust/lex/rust-input-source.h
+++ b/gcc/rust/lex/rust-input-source.h
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #ifndef RUST_INPUT_SOURCE_H
 #define RUST_INPUT_SOURCE_H
 
@@ -5,6 +23,14 @@
 #include "optional.h"
 
 namespace Rust {
+
+constexpr uint8_t UTF8_BOM1 = 0xEF;
+constexpr uint8_t UTF8_BOM2 = 0xBB;
+constexpr uint8_t UTF8_BOM3 = 0xBF;
+
+constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
+constexpr uint32_t CODEPOINT_INVALID = 0xFFFE;
+
 // Input source wrapper thing.
 class InputSource
 {
@@ -23,7 +49,7 @@ private:
 
     if ((int32_t) input == EOF)
       return Codepoint::eof ();
-    else if (input < 128)
+    else if (input <= MAX_ASCII_CODEPOINT)
       {
 	// ascii -- 1 byte
 	return {input};
@@ -31,14 +57,14 @@ private:
     else if ((input & 0xC0) == 0x80)
       {
 	// invalid (continuation; can't be first char)
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
     else if ((input & 0xE0) == 0xC0)
       {
 	// 2 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
 	return output;
@@ -50,23 +76,23 @@ private:
 	// If the second byte is equal to 0xBB then the input is no longer a
 	// valid UTF-8 char. Then, we check if the third byte makes up a UTF
 	// BOM.
-	if (input == 0xEF && input2 == 0xBB)
+	if (input == UTF8_BOM1 && input2 == UTF8_BOM2)
 	  {
 	    uint8_t input3 = next_byte ();
-	    if (input3 == 0xBF)
+	    if (input3 == UTF8_BOM3)
 	      // found BOM
 	      return next_codepoint ();
 	    else
-	      return {0xFFFE};
+	      return {CODEPOINT_INVALID};
 	  }
 
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
 			  | ((input3 & 0x3F) << 0);
@@ -77,15 +103,15 @@ private:
 	// 4 bytes
 	uint8_t input2 = next_byte ();
 	if ((input2 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input3 = next_byte ();
 	if ((input3 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint8_t input4 = next_byte ();
 	if ((input4 & 0xC0) != 0x80)
-	  return {0xFFFE};
+	  return {CODEPOINT_INVALID};
 
 	uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
 			  | ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
@@ -93,23 +119,26 @@ private:
       }
     else
       {
-	return {0xFFFE};
+	return {CODEPOINT_INVALID};
       }
   }
 
 protected:
-  // Check if the input source is valid as utf-8 and copy all characters to
-  // `chars`.
+  // This method must be called by the constructor to initialize the input
+  // source. We cannot move this to the constructor because it calls a
+  // virtual method.
   void init ()
   {
+    // Check if the input source is valid as utf-8 and copy all characters to
+    // `chars`.
     Codepoint char32 = next_codepoint ();
-    while (!char32.is_eof () && char32 != 0xFFFE)
+    while (!char32.is_eof () && char32 != CODEPOINT_INVALID)
       {
 	chars.push_back (char32);
 	char32 = next_codepoint ();
       }
 
-    if (char32 == 0xFFFE)
+    if (char32 == CODEPOINT_INVALID)
       {
 	// Input source is not valid as utf-8.
 	is_valid_utf8 = false;
@@ -158,11 +187,7 @@ private:
 
 public:
   // Create new input source from file.
-  FileInputSource (FILE *input) : InputSource (), input (input)
-  {
-    // TODO make this better?
-    init ();
-  }
+  FileInputSource (FILE *input) : InputSource (), input (input) { init (); }
 };
 
 class BufferInputSource : public InputSource
@@ -175,7 +200,7 @@ private:
   {
     if (offs >= buffer.size ())
       return EOF;
-    return (uint8_t) buffer.at (offs++);
+    return static_cast<uint8_t> (buffer.at (offs++));
   }
 
 public:
@@ -183,7 +208,6 @@ public:
   BufferInputSource (const std::string &b, size_t offset)
     : InputSource (), buffer (b), offs (offset)
   {
-    // TODO make this better?
     init ();
   }
 };
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/util/rust-codepoint.h
similarity index 100%
rename from gcc/rust/lex/rust-codepoint.h
rename to gcc/rust/util/rust-codepoint.h
diff --git a/gcc/rust/util/rust-punycode.cc b/gcc/rust/util/rust-punycode.cc
index a35d54aa6f5..6c796ab794f 100644
--- a/gcc/rust/util/rust-punycode.cc
+++ b/gcc/rust/util/rust-punycode.cc
@@ -36,15 +36,13 @@ constexpr uint32_t INITIAL_BIAS = 72;
 constexpr uint32_t INITIAL_N = 128;
 constexpr char DELIMITER = '-';
 
-constexpr uint32_t MAX_ASCII_CODEPOINT = 0x7F;
-
 std::string
 extract_basic_string (const std::vector<Codepoint> &src)
 {
   std::string basic_string;
   for (auto c : src)
     {
-      if (c.value <= MAX_ASCII_CODEPOINT)
+      if (c.value <= 0x7F)
 	basic_string += c.as_string ();
     }
   return basic_string;
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index b2ddaf0b9ce..95653cb760d 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -1,3 +1,21 @@
+// Copyright (C) 2020-2023 Free Software Foundation, Inc.
+
+// This file is part of GCC.
+
+// GCC is free software; you can redistribute it and/or modify it under
+// the terms of the GNU General Public License as published by the Free
+// Software Foundation; either version 3, or (at your option) any later
+// version.
+
+// GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+// WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// for more details.
+
+// You should have received a copy of the GNU General Public License
+// along with GCC; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
 #include "rust-system.h"
 #include "optional.h"
 #include "selftest.h"

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-01-16 18:06 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-16 18:06 [gcc r14-7912] gccrs: clean up Codepoint and InputSource Arthur Cohen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).