public inbox for gcc-cvs@sourceware.org
help / color / mirror / Atom feed
* [gcc r14-7868] gccrs: Add Unicode check for crate_name attributes
@ 2024-01-16 18:05 Arthur Cohen
  0 siblings, 0 replies; only message in thread
From: Arthur Cohen @ 2024-01-16 18:05 UTC (permalink / raw)
  To: gcc-cvs

https://gcc.gnu.org/g:f7b2e17682b5139a08f7956226bf7ccbdec88230

commit r14-7868-gf7b2e17682b5139a08f7956226bf7ccbdec88230
Author: Raiki Tamura <tamaron1203@gmail.com>
Date:   Fri Jul 14 14:45:34 2023 +0900

    gccrs: Add Unicode check for crate_name attributes
    
    gcc/rust/ChangeLog:
    
            * lex/rust-codepoint.h: Add comment
            * lex/rust-lex.h: New method to get decoded characters
            * rust-session-manager.cc (validate_crate_name): Modify unicode check
            (rust_crate_name_validation_test): Add testcases
            * util/rust-unicode.h (RUST_UNICODE_H): New class Utf8String.
            (class Utf8String): New class.
            * util/rust-unicode.cc (binary_search_sorted_array): Add comment.
            (recursive_decomp_cano): Add comment.
            (recomp): Remove dead code.
            (dump_string): Removed.
    
    gcc/testsuite/ChangeLog:
    
            * rust/compile/bad-crate-name.rs: Moved to...
            * rust/compile/bad-crate-name1.rs: ...here.
            * rust/compile/bad-crate-name2.rs: New test.
    
    Signed-off-by: Raiki Tamura <tamaron1203@gmail.com>

Diff:
---
 gcc/rust/lex/rust-codepoint.h                      |  2 ++
 gcc/rust/lex/rust-lex.h                            |  8 +++++
 gcc/rust/rust-session-manager.cc                   | 34 +++++++++++++++-------
 gcc/rust/util/rust-unicode.cc                      | 23 +++------------
 gcc/rust/util/rust-unicode.h                       | 19 ++++++++++++
 .../{bad-crate-name.rs => bad-crate-name1.rs}      |  0
 gcc/testsuite/rust/compile/bad-crate-name2.rs      |  2 ++
 7 files changed, 59 insertions(+), 29 deletions(-)

diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h
index e2d05718f30..755c8373383 100644
--- a/gcc/rust/lex/rust-codepoint.h
+++ b/gcc/rust/lex/rust-codepoint.h
@@ -22,6 +22,8 @@
 #include "rust-system.h"
 
 namespace Rust {
+
+// FIXME: move this to rust-unicode.h?
 struct Codepoint
 {
   uint32_t value;
diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
index 27286ac2877..91e814b76f3 100644
--- a/gcc/rust/lex/rust-lex.h
+++ b/gcc/rust/lex/rust-lex.h
@@ -334,6 +334,14 @@ public:
 	  return c;
 	}
     }
+
+    tl::optional<std::vector<Codepoint>> get_chars ()
+    {
+      if (is_valid ())
+	return {chars};
+      else
+	return tl::nullopt;
+    }
   };
 
   class FileInputSource : public InputSource
diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
index 4f779d17077..3461198be95 100644
--- a/gcc/rust/rust-session-manager.cc
+++ b/gcc/rust/rust-session-manager.cc
@@ -42,6 +42,7 @@
 #include "rust-early-name-resolver.h"
 #include "rust-cfg-strip.h"
 #include "rust-expand-visitor.h"
+#include "rust-unicode.h"
 
 #include "diagnostic.h"
 #include "input.h"
@@ -107,30 +108,39 @@ infer_crate_name (const std::string &filename)
   return crate;
 }
 
-/* Validate the crate name using the ASCII rules
-   TODO: Support Unicode version of the rules */
+/* Validate the crate name using the Unicode rules */
 
 static bool
 validate_crate_name (const std::string &crate_name, Error &error)
 {
-  if (crate_name.empty ())
+  Utf8String utf8_name = {crate_name};
+  tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
+
+  if (!uchars_opt.has_value ())
+    {
+      error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
+      return false;
+    }
+
+  std::vector<Codepoint> uchars = uchars_opt.value ();
+  if (uchars.empty ())
     {
       error = Error (UNDEF_LOCATION, "crate name cannot be empty");
       return false;
     }
-  if (crate_name.length () > kMaxNameLength)
+  if (uchars.size () > kMaxNameLength)
     {
       error = Error (UNDEF_LOCATION, "crate name cannot exceed %lu characters",
 		     (unsigned long) kMaxNameLength);
       return false;
     }
-  for (auto &c : crate_name)
+  for (Codepoint &c : uchars)
     {
-      if (!(ISALNUM (c) || c == '_'))
+      if (!(is_alphabetic (c.value) || is_numeric (c.value) || c.value == '_'))
 	{
 	  error = Error (UNDEF_LOCATION,
-			 "invalid character %<%c%> in crate name: %<%s%>", c,
-			 crate_name.c_str ());
+			 "invalid character %<%s%> in crate name: %<%s%>",
+			 c.as_string ().c_str (), crate_name.c_str ());
 	  return false;
 	}
     }
@@ -1273,13 +1283,17 @@ rust_crate_name_validation_test (void)
   ASSERT_TRUE (Rust::validate_crate_name ("example", error));
   ASSERT_TRUE (Rust::validate_crate_name ("abcdefg_1234", error));
   ASSERT_TRUE (Rust::validate_crate_name ("1", error));
-  // FIXME: The next test does not pass as of current implementation
-  // ASSERT_TRUE (Rust::CompileOptions::validate_crate_name ("惊吓"));
+  ASSERT_TRUE (Rust::validate_crate_name ("クレート", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("Sōkrátēs", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("惊吓", error));
+
   // NOTE: - is not allowed in the crate name ...
 
   ASSERT_FALSE (Rust::validate_crate_name ("abcdefg-1234", error));
   ASSERT_FALSE (Rust::validate_crate_name ("a+b", error));
   ASSERT_FALSE (Rust::validate_crate_name ("/a+b/", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("😸++", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("∀", error));
 
   /* Tests for crate name inference */
   ASSERT_EQ (Rust::infer_crate_name ("c.rs"), "c");
diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
index 738e1f162eb..73e1abd9980 100644
--- a/gcc/rust/util/rust-unicode.cc
+++ b/gcc/rust/util/rust-unicode.cc
@@ -12,6 +12,7 @@ typedef std::vector<codepoint_t> string_t;
 template <std::size_t SIZE>
 int64_t
 binary_search_ranges (
+  // FIXME: use binary search function from <algorithm>
   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
   uint32_t target_cp)
 {
@@ -49,6 +50,7 @@ int64_t
 binary_search_sorted_array (const std::array<uint32_t, SIZE> &array,
 			    uint32_t target)
 {
+  // FIXME: use binary search function from <algorithm>
   if (SIZE == 0)
     return -1;
 
@@ -104,9 +106,7 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
     {
       string_t decomped = it->second;
       for (codepoint_t cp : decomped)
-	{
-	  recursive_decomp_cano (cp, buf);
-	}
+	recursive_decomp_cano (cp, buf);
     }
   else
     buf.push_back (c);
@@ -152,8 +152,7 @@ recomp (string_t s)
   if (s.size () > 0)
     {
       int last_class = -1;
-      // int starter_pos = 0; // Assume the first character is Starter. Correct?
-      // int target_pos = 1;
+      // Assume the first character is Starter.
       codepoint_t starter_ch = s[0];
       for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
 	{
@@ -189,20 +188,6 @@ recomp (string_t s)
   return buf;
 }
 
-// TODO: remove
-/*
-void
-dump_string (std::vector<uint32_t> s)
-{
-  std::cout << "dump=";
-  for (auto c : s)
-    {
-      std::cout << std::hex << c << ", ";
-    }
-  std::cout << std::endl;
-}
-*/
-
 string_t
 nfc_normalize (string_t s)
 {
diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
index 8c0bd0656c4..68005587d0a 100644
--- a/gcc/rust/util/rust-unicode.h
+++ b/gcc/rust/util/rust-unicode.h
@@ -19,10 +19,29 @@
 #ifndef RUST_UNICODE_H
 #define RUST_UNICODE_H
 
+#include "optional.h"
 #include "rust-system.h"
+#include "rust-lex.h"
 
 namespace Rust {
 
+class Utf8String
+{
+private:
+  tl::optional<std::vector<Codepoint>> chars;
+
+public:
+  Utf8String (const std::string &maybe_utf8)
+  {
+    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
+    chars = input_source.get_chars ();
+  }
+
+  // Returns UTF codepoints when string is valid as UTF-8, returns nullopt
+  // otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
+};
+
 // TODO: add function nfc_normalize
 
 bool
diff --git a/gcc/testsuite/rust/compile/bad-crate-name.rs b/gcc/testsuite/rust/compile/bad-crate-name1.rs
similarity index 100%
rename from gcc/testsuite/rust/compile/bad-crate-name.rs
rename to gcc/testsuite/rust/compile/bad-crate-name1.rs
diff --git a/gcc/testsuite/rust/compile/bad-crate-name2.rs b/gcc/testsuite/rust/compile/bad-crate-name2.rs
new file mode 100644
index 00000000000..1d80fa5978d
--- /dev/null
+++ b/gcc/testsuite/rust/compile/bad-crate-name2.rs
@@ -0,0 +1,2 @@
+#![crate_name = "😅"] // { dg-error "invalid character ...." "" }
+fn main() {}

^ permalink raw reply	[flat|nested] only message in thread

only message in thread, other threads:[~2024-01-16 18:05 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2024-01-16 18:05 [gcc r14-7868] gccrs: Add Unicode check for crate_name attributes Arthur Cohen

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).