Skip to content

Normalize all identifiers to NFC #2489

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
193 changes: 193 additions & 0 deletions gcc/rust/lex/rust-input-source.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,193 @@
#ifndef RUST_INPUT_SOURCE_H
#define RUST_INPUT_SOURCE_H

#include "rust-codepoint.h"
#include "optional.h"

namespace Rust {
// Abstract source of UTF-8 encoded text.
//
// A concrete subclass supplies raw bytes through next_byte () and calls
// init () from its constructor.  init () decodes the whole byte stream into
// Unicode code points up front; afterwards the code points are handed out one
// at a time by next (), or all at once by get_chars ().
class InputSource
{
private:
  // Index into `chars` of the code point the following call to next ()
  // returns.
  size_t pos;
  // Code points decoded by init (), up to (not including) the first decoding
  // error.
  std::vector<Codepoint> chars;
  // Starts out true; cleared by init () when decoding hits an error.
  bool is_valid_utf8;

  // Returns the next raw byte of the underlying stream, or EOF once the
  // stream is exhausted.  Implemented by every concrete input source.
  virtual int next_byte () = 0;

  // Decodes and returns the next code point of the byte stream.
  //
  // Returns Codepoint::eof () when the stream ends on a code point boundary,
  // and the sentinel 0xFFFE when the upcoming bytes are not a well-formed
  // UTF-8 sequence: a stray continuation byte, an impossible lead byte, a
  // truncated sequence, an overlong encoding, an encoded surrogate
  // (U+D800..U+DFFF) or a value above U+10FFFF.
  //
  // A byte order mark (U+FEFF, bytes EF BB BF) is skipped wherever it occurs.
  //
  // NOTE: a well-formed U+FFFE (bytes EF BF BE) decodes to the same value as
  // the error sentinel, so init () treats it as a decoding error as well.
  Codepoint next_codepoint ()
  {
    for (;;)
      {
        int first = next_byte ();
        if (first == EOF)
          return Codepoint::eof ();

        uint32_t lead = static_cast<uint32_t> (first);
        if (lead < 0x80)
          {
            // ascii -- 1 byte
            return {lead};
          }

        // Payload bits taken from the lead byte, number of continuation
        // bytes that must follow, and the smallest value this sequence
        // length is allowed to encode (anything lower is overlong).
        uint32_t value;
        uint32_t minimum;
        int trailing;
        if ((lead & 0xE0) == 0xC0)
          {
            // 2 bytes
            value = lead & 0x1F;
            minimum = 0x80;
            trailing = 1;
          }
        else if ((lead & 0xF0) == 0xE0)
          {
            // 3 bytes
            value = lead & 0x0F;
            minimum = 0x800;
            trailing = 2;
          }
        else if ((lead & 0xF8) == 0xF0)
          {
            // 4 bytes
            value = lead & 0x07;
            minimum = 0x10000;
            trailing = 3;
          }
        else
          {
            // invalid: either a continuation byte (10xxxxxx), which cannot
            // start a sequence, or a lead byte in 0xF8..0xFF.
            return {0xFFFE};
          }

        for (int i = 0; i < trailing; i++)
          {
            // EOF in the middle of a sequence is a truncated sequence.
            int byte = next_byte ();
            if (byte == EOF || (byte & 0xC0) != 0x80)
              return {0xFFFE};
            value = (value << 6) | static_cast<uint32_t> (byte & 0x3F);
          }

        // Reject overlong forms, values beyond the Unicode range and
        // surrogates, none of which are allowed in UTF-8.
        if (value < minimum || value > 0x10FFFF
            || (value >= 0xD800 && value <= 0xDFFF))
          return {0xFFFE};

        // Byte order mark: drop it and decode the code point after it.
        // Every other sequence starting with EF BB (U+FEC0..U+FEFE) is an
        // ordinary code point and is returned like any other.
        if (value == 0xFEFF)
          continue;

        return {value};
      }
  }

protected:
  // Decodes the whole byte stream and appends the resulting code points to
  // `chars`.  Decoding stops at end of input or at the first decoding error;
  // in the latter case the input source is marked as not valid UTF-8 and
  // `chars` holds only the code points that preceded the error.
  void init ()
  {
    Codepoint char32 = next_codepoint ();
    while (!char32.is_eof () && char32 != 0xFFFE)
      {
        chars.push_back (char32);
        char32 = next_codepoint ();
      }

    if (char32 == 0xFFFE)
      {
        // Input source is not valid as utf-8.
        is_valid_utf8 = false;
      }
  }

public:
  // Creates an empty source that is considered valid until init () finds a
  // decoding error.
  InputSource () : pos (0), chars (), is_valid_utf8 (true) {}

  virtual ~InputSource () {}

  // Returns false if init () encountered a decoding error, true otherwise.
  bool is_valid () const { return is_valid_utf8; }

  // Returns the code point at the current position and advances past it.
  // Returns Codepoint::eof () once every stored code point has been handed
  // out.
  Codepoint next ()
  {
    if (pos >= chars.size ())
      return Codepoint::eof ();

    return chars[pos++];
  }

  // Returns a copy of all decoded code points if the input source is valid,
  // and nullopt otherwise.
  tl::optional<std::vector<Codepoint>> get_chars () const
  {
    if (is_valid ())
      return {chars};
    else
      return tl::nullopt;
  }
};

// Input source whose bytes are read from a C stdio stream.
class FileInputSource : public InputSource
{
public:
  // Wraps the stream `input` and immediately decodes it through init ().
  // The stream is read with fgetc () and is not closed by this class.
  FileInputSource (FILE *input) : InputSource (), file (input)
  {
    // TODO make this better?
    init ();
  }

private:
  // Stream the bytes are read from.
  FILE *file;

  // Reads one byte from the stream; fgetc () yields EOF at end of file.
  int next_byte () override { return fgetc (file); }
};

// Input source whose bytes are taken from an in-memory string.
class BufferInputSource : public InputSource
{
public:
  // Creates a new input source from the string `b`, starting at byte index
  // `offset`, and immediately decodes it through init ().  Only a reference
  // to `b` is stored, not a copy.
  BufferInputSource (const std::string &b, size_t offset)
    : InputSource (), text (b), cursor (offset)
  {
    // TODO make this better?
    init ();
  }

private:
  // String the bytes are read from.
  const std::string &text;
  // Index of the next byte of `text` to hand out.
  size_t cursor;

  // Returns the byte at the cursor as a value in 0..255 and advances the
  // cursor, or EOF once the cursor has reached the end of the string.
  int next_byte () override
  {
    if (cursor < text.size ())
      return static_cast<uint8_t> (text[cursor++]);

    return EOF;
  }
};

} // namespace Rust

#endif
7 changes: 3 additions & 4 deletions gcc/rust/lex/rust-lex.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2534,8 +2534,7 @@ namespace selftest {

// Checks if `src` has the same contents as the given characters
void
assert_source_content (Rust::Lexer::InputSource &src,
std::vector<uint32_t> expected)
assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
{
Rust::Codepoint src_char = src.next ();
for (auto expected_char : expected)
Expand All @@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
void
test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
{
Rust::Lexer::BufferInputSource source (str, 0);
Rust::BufferInputSource source (str, 0);
assert_source_content (source, expected);
}

Expand All @@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
// Moves to the first character
fputs (str.c_str (), tmpf);
std::rewind (tmpf);
Rust::Lexer::FileInputSource source (tmpf);
Rust::FileInputSource source (tmpf);
assert_source_content (source, expected);
}

Expand Down
Loading