Rust-GCC · philberty · Jul 30, 2023 · Jul 14, 2023 · tamaroning · Jul 21, 2023
diff --git a/gcc/rust/lex/rust-codepoint.h b/gcc/rust/lex/rust-codepoint.h
@@ -22,6 +22,8 @@
 #include "rust-system.h"
 
 namespace Rust {
+
+// FIXME: move this to rust-unicode.h?
 struct Codepoint
 {
   uint32_t value;

diff --git a/gcc/rust/lex/rust-lex.h b/gcc/rust/lex/rust-lex.h
@@ -334,6 +334,14 @@ class Lexer
 	  return c;
 	}
     }
+
+    tl::optional<std::vector<Codepoint>> get_chars ()
+    {
+      if (is_valid ())
+	return {chars};
+      else
+	return tl::nullopt;
+    }
   };
 
   class FileInputSource : public InputSource

diff --git a/gcc/rust/rust-session-manager.cc b/gcc/rust/rust-session-manager.cc
@@ -42,6 +42,7 @@
 #include "rust-early-name-resolver.h"
 #include "rust-cfg-strip.h"
 #include "rust-expand-visitor.h"
+#include "rust-unicode.h"
 
 #include "diagnostic.h"
 #include "input.h"
@@ -107,30 +108,39 @@ infer_crate_name (const std::string &filename)
   return crate;
 }
 
-/* Validate the crate name using the ASCII rules
-   TODO: Support Unicode version of the rules */
+/* Validate the crate name: Unicode alphabetic or numeric characters and '_' */
 
 static bool
 validate_crate_name (const std::string &crate_name, Error &error)
 {
-  if (crate_name.empty ())
+  Utf8String utf8_name = {crate_name};
+  tl::optional<std::vector<Codepoint>> uchars_opt = utf8_name.get_chars ();
+
+  if (!uchars_opt.has_value ())
+    {
+      error = Error (UNDEF_LOCATION, "crate name is not a valid UTF-8 string");
+      return false;
+    }
+
+  std::vector<Codepoint> uchars = uchars_opt.value ();
+  if (uchars.empty ())
     {
       error = Error (UNDEF_LOCATION, "crate name cannot be empty");
       return false;
     }
-  if (crate_name.length () > kMaxNameLength)
+  if (uchars.size () > kMaxNameLength)
     {
       error = Error (UNDEF_LOCATION, "crate name cannot exceed %lu characters",
 		     (unsigned long) kMaxNameLength);
       return false;
     }
-  for (auto &c : crate_name)
+  for (Codepoint &c : uchars)
     {
-      if (!(ISALNUM (c) || c == '_'))
+      if (!(is_alphabetic (c.value) || is_numeric (c.value) || c.value == '_'))
 	{
 	  error = Error (UNDEF_LOCATION,
-			 "invalid character %<%c%> in crate name: %<%s%>", c,
-			 crate_name.c_str ());
+			 "invalid character %<%s%> in crate name: %<%s%>",
+			 c.as_string ().c_str (), crate_name.c_str ());
 	  return false;
 	}
     }
@@ -1273,13 +1283,17 @@ rust_crate_name_validation_test (void)
   ASSERT_TRUE (Rust::validate_crate_name ("example", error));
   ASSERT_TRUE (Rust::validate_crate_name ("abcdefg_1234", error));
   ASSERT_TRUE (Rust::validate_crate_name ("1", error));
-  // FIXME: The next test does not pass as of current implementation
-  // ASSERT_TRUE (Rust::CompileOptions::validate_crate_name ("惊吓"));
+  ASSERT_TRUE (Rust::validate_crate_name ("クレート", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("Sōkrátēs", error));
+  ASSERT_TRUE (Rust::validate_crate_name ("惊吓", error));
+
   // NOTE: - is not allowed in the crate name ...
 
   ASSERT_FALSE (Rust::validate_crate_name ("abcdefg-1234", error));
   ASSERT_FALSE (Rust::validate_crate_name ("a+b", error));
   ASSERT_FALSE (Rust::validate_crate_name ("/a+b/", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("😸++", error));
+  ASSERT_FALSE (Rust::validate_crate_name ("∀", error));
 
   /* Tests for crate name inference */
   ASSERT_EQ (Rust::infer_crate_name ("c.rs"), "c");

diff --git a/gcc/rust/util/rust-unicode.cc b/gcc/rust/util/rust-unicode.cc
@@ -12,6 +12,7 @@ typedef std::vector<codepoint_t> string_t;
 template <std::size_t SIZE>
 int64_t
 binary_search_ranges (
+  // FIXME: use binary search function from <algorithm>
   const std::array<std::pair<uint32_t, uint32_t>, SIZE> &ranges,
   uint32_t target_cp)
 {
@@ -49,6 +50,7 @@ int64_t
 binary_search_sorted_array (const std::array<std::uint32_t, SIZE> &array,
 			    uint32_t target)
 {
+  // FIXME: use binary search function from <algorithm>
   if (SIZE == 0)
     return -1;
 
@@ -104,9 +106,7 @@ recursive_decomp_cano (codepoint_t c, string_t &buf)
     {
       string_t decomped = it->second;
       for (codepoint_t cp : decomped)
-	{
-	  recursive_decomp_cano (cp, buf);
-	}
+	recursive_decomp_cano (cp, buf);
     }
   else
     buf.push_back (c);
@@ -152,8 +152,7 @@ recomp (string_t s)
   if (s.size () > 0)
     {
       int last_class = -1;
-      // int starter_pos = 0; // Assume the first character is Starter. Correct?
-      // int target_pos = 1;
+      // Assume the first character is Starter.
       codepoint_t starter_ch = s[0];
       for (unsigned int src_pos = 1; src_pos < s.size (); src_pos++)
 	{
@@ -189,20 +188,6 @@ recomp (string_t s)
   return buf;
 }
 
-// TODO: remove
-/*
-void
-dump_string (std::vector<uint32_t> s)
-{
-  std::cout << "dump=";
-  for (auto c : s)
-    {
-      std::cout << std::hex << c << ", ";
-    }
-  std::cout << std::endl;
-}
-*/
-
 string_t
 nfc_normalize (string_t s)
 {

diff --git a/gcc/rust/util/rust-unicode.h b/gcc/rust/util/rust-unicode.h
@@ -19,10 +19,29 @@
 #ifndef RUST_UNICODE_H
 #define RUST_UNICODE_H
 
+#include "optional.h"
 #include "rust-system.h"
+#include "rust-lex.h"
 
 namespace Rust {
 
+class Utf8String
+{
+private:
+  tl::optional<std::vector<Codepoint>> chars;
+
+public:
+  Utf8String (const std::string &maybe_utf8)
+  {
+    Lexer::BufferInputSource input_source = {maybe_utf8, 0};
+    chars = input_source.get_chars ();
+  }
+
+  // Returns the Unicode codepoints when the string is valid UTF-8, returns
+  // nullopt otherwise.
+  tl::optional<std::vector<Codepoint>> get_chars () const { return chars; }
+};
+
 // TODO: add function nfc_normalize
 
 bool

diff --git a/gcc/testsuite/rust/compile/bad-crate-name.rs → ...testsuite/rust/compile/bad-crate-name1.rs b/gcc/testsuite/rust/compile/bad-crate-name.rs → ...testsuite/rust/compile/bad-crate-name1.rs
diff --git a/gcc/testsuite/rust/compile/bad-crate-name2.rs b/gcc/testsuite/rust/compile/bad-crate-name2.rs
@@ -0,0 +1,2 @@
+#![crate_name = "😅"] // { dg-error "invalid character ...." "" }
+fn main() {}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		#![crate_name = "😅"] // { dg-error "invalid character ...." "" }
		fn main() {}