Skip to content

Add punycode encoding to v0 mangling #2535

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 38 additions & 15 deletions gcc/rust/backend/rust-mangle.cc
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
#include "rust-mangle.h"
#include "fnv-hash.h"
#include "optional.h"
#include "rust-base62.h"
#include "rust-unicode.h"
#include "optional.h"
#include "rust-diagnostics.h"
#include "rust-unicode.h"
#include "rust-punycode.h"

// FIXME: Rename those to legacy_*
static const std::string kMangledSymbolPrefix = "_ZN";
Expand Down Expand Up @@ -249,22 +252,42 @@ v0_add_disambiguator (std::string &mangled, uint64_t dis)
static void
v0_add_identifier (std::string &mangled, const std::string &identifier)
{
// FIXME: gccrs cannot handle unicode identifiers yet, so we never have to
// create mangling for unicode values for now. However, this is handled
// by the v0 mangling scheme. The grammar for unicode identifier is
// contained in <undisambiguated-identifier>, right under the <identifier>
// one. If the identifier contains unicode values, then an extra "u" needs
// to be added to the mangling string and `punycode` must be used to encode
// the characters.

mangled += std::to_string (identifier.size ());

// The grammar for unicode identifier is contained in
// <undisambiguated-identifier>, right under the <identifier> one. If the
// identifier contains unicode values, then an extra "u" needs to be added to
// the mangling string and `punycode` must be used to encode the characters.
tl::optional<Utf8String> uident_opt
= Utf8String::make_utf8_string (identifier);
rust_assert (uident_opt.has_value ());
tl::optional<std::string> punycode_opt
= encode_punycode (uident_opt.value ());
rust_assert (punycode_opt.has_value ());

bool is_ascii_ident = true;
for (auto c : uident_opt.value ().get_chars ())
if (c.value > 127)
{
is_ascii_ident = false;
break;
}

std::string punycode = punycode_opt.value ();
// remove tailing hyphen
if (punycode.back () == '-')
punycode.pop_back ();
// replace hyphens in punycode with underscores
std::replace (punycode.begin (), punycode.end (), '-', '_');

if (!is_ascii_ident)
mangled.append ("u");

mangled += std::to_string (punycode.size ());
// If the first character of the identifier is a digit or an underscore, we
// add an extra underscore
if (identifier[0] == '_')
mangled.append ("_");
if (punycode[0] == '_')
mangled += "_";

mangled.append (identifier);
mangled += punycode;
}

static std::string
Expand Down Expand Up @@ -300,9 +323,9 @@ v0_mangle_item (const TyTy::BaseType *ty, const Resolver::CanonicalPath &path)

std::string mangled;
// FIXME: Add real algorithm once all pieces are implemented
auto ty_prefix = v0_type_prefix (ty);
v0_add_identifier (mangled, crate_name);
v0_add_disambiguator (mangled, 62);
auto ty_prefix = v0_type_prefix (ty);

rust_unreachable ();
}
Expand Down
14 changes: 8 additions & 6 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2533,8 +2533,9 @@ Lexer::start_line (int current_line, int current_column)
namespace selftest {

// Checks if `src` has the same contents as the given characters
void
assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
static void
assert_source_content (Rust::InputSource &src,
const std::vector<uint32_t> &expected)
{
Rust::Codepoint src_char = src.next ();
for (auto expected_char : expected)
Expand All @@ -2549,15 +2550,16 @@ assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
ASSERT_TRUE (src_char.is_eof ());
}

void
test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
static void
test_buffer_input_source (std::string str,
const std::vector<uint32_t> &expected)
{
Rust::BufferInputSource source (str, 0);
assert_source_content (source, expected);
}

void
test_file_input_source (std::string str, std::vector<uint32_t> expected)
static void
test_file_input_source (std::string str, const std::vector<uint32_t> &expected)
{
FILE *tmpf = tmpfile ();
// Moves to the first character
Expand Down
2 changes: 1 addition & 1 deletion gcc/rust/resolve/rust-ast-resolve-toplevel.h
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ class ResolveTopLevel : public ResolverBase
}
else
{
CrateNum found_crate_num = UNKNOWN_CREATENUM;
CrateNum found_crate_num = UNKNOWN_CRATENUM;
bool found
= mappings->lookup_crate_name (extern_crate.get_referenced_crate (),
found_crate_num);
Expand Down
2 changes: 1 addition & 1 deletion gcc/rust/rust-session-manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -979,7 +979,7 @@ NodeId
Session::load_extern_crate (const std::string &crate_name, location_t locus)
{
// has it already been loaded?
CrateNum found_crate_num = UNKNOWN_CREATENUM;
CrateNum found_crate_num = UNKNOWN_CRATENUM;
bool found = mappings->lookup_crate_name (crate_name, found_crate_num);
if (found)
{
Expand Down
6 changes: 3 additions & 3 deletions gcc/rust/util/rust-canonical-path.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class CanonicalPath
{
rust_assert (!path.empty ());
return CanonicalPath ({std::pair<NodeId, std::string> (id, path)},
UNKNOWN_CREATENUM);
UNKNOWN_CRATENUM);
}

static CanonicalPath
Expand Down Expand Up @@ -88,7 +88,7 @@ class CanonicalPath

static CanonicalPath create_empty ()
{
return CanonicalPath ({}, UNKNOWN_CREATENUM);
return CanonicalPath ({}, UNKNOWN_CRATENUM);
}

bool is_empty () const { return segs.size () == 0; }
Expand Down Expand Up @@ -171,7 +171,7 @@ class CanonicalPath

CrateNum get_crate_num () const
{
rust_assert (crate_num != UNKNOWN_CREATENUM);
rust_assert (crate_num != UNKNOWN_CRATENUM);
return crate_num;
}

Expand Down
4 changes: 2 additions & 2 deletions gcc/rust/util/rust-hir-map.cc
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace Analysis {
NodeMapping
NodeMapping::get_error ()
{
return NodeMapping (UNKNOWN_CREATENUM, UNKNOWN_NODEID, UNKNOWN_HIRID,
return NodeMapping (UNKNOWN_CRATENUM, UNKNOWN_NODEID, UNKNOWN_HIRID,
UNKNOWN_LOCAL_DEFID);
}

Expand Down Expand Up @@ -94,7 +94,7 @@ static const HirId kDefaultHirIdBegin = 1;
static const HirId kDefaultCrateNumBegin = 0;

Mappings::Mappings ()
: crateNumItr (kDefaultCrateNumBegin), currentCrateNum (UNKNOWN_CREATENUM),
: crateNumItr (kDefaultCrateNumBegin), currentCrateNum (UNKNOWN_CRATENUM),
hirIdIter (kDefaultHirIdBegin), nodeIdIter (kDefaultNodeIdBegin)
{
Analysis::NodeMapping node (0, 0, 0, 0);
Expand Down
2 changes: 1 addition & 1 deletion gcc/rust/util/rust-mapping-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ struct DefId
}
};

#define UNKNOWN_CREATENUM ((uint32_t) (0))
#define UNKNOWN_CRATENUM ((uint32_t) (UINT32_MAX))
#define UNKNOWN_NODEID ((uint32_t) (0))
Comment on lines 63 to 65
Copy link
Contributor Author

@tamaroning tamaroning Aug 7, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed this value because CrateNumItr is initialized to 0, which means a crate compiled first is treated as UNKNOWN_CRATENUM.
ref: #894

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not sure but the same might be also true for UNKNOWN_NODEID, UNKNOWN_HIRID, etc.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I changed this value because CrateNumItr is initialized to 0, which means a crate compiled first is treated as UNKNOWN_CRATENUM. ref: #894

I'm not a fan of putting special values, they should be treated as a completely separate thing (ehehe, rust habits I guess). For now I think this seems mostly ok. At least I can't think of a situation where it break in a realistic scenario. But in the future we might need to handle this properly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CrateNum field was added to CanonicalPath for the v0 mangling because the field is used as disambiguator in demangled symbols.
Actually CanonicalPath::get_crate_num internally calls the following assert and fails here:
https://github.com/Rust-GCC/gccrs/blob/e55113ea2bf0cec2f8436a576ce6f2bcaacd1c27/gcc/rust/util/rust-canonical-path.h#L174C50-L174C50
So I have had to modify this value to avoid this assert failure.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here thats a good idea @tamaroning raise an issue to do the same with the other ID's

#define UNKNOWN_HIRID ((uint32_t) (0))
#define UNKNOWN_LOCAL_DEFID ((uint32_t) (0))
Expand Down