Skip to content

Commit 35b67c3

Browse files
committed
gccrs: Normalize all identifier tokens
gcc/rust/ChangeLog: * lex/rust-lex.cc (assert_source_content): Fix namespace specifier (test_buffer_input_source): Likewise. (test_file_input_source): Likewise. * lex/rust-lex.h: Move InputSource ... * lex/rust-input-source.h: ... to here. (New file) * lex/rust-token.cc (nfc_normalize_token_string): New function * lex/rust-token.h (nfc_normalize_token_string): New function * rust-lang.cc (run_rust_tests): Modify order of selftests. * rust-session-manager.cc (validate_crate_name): Modify interface of Utf8String. * util/rust-unicode.cc (lookup_cc): Modify codepoint_t typedef. (lookup_recomp): Likewise. (recursive_decomp_cano): Likewise. (decomp_cano): Likewise. (sort_cano): Likewise. (compose_hangul): Likewise. (assert_normalize): Likewise. (Utf8String::nfc_normalize): New function. * util/rust-unicode.h: Modify interface of Utf8String. gcc/testsuite/ChangeLog: * rust/compile/unicode_norm1.rs: New test. Signed-off-by: Raiki Tamura <[email protected]>
1 parent f7d9373 commit 35b67c3

10 files changed

+304
-226
lines changed

gcc/rust/lex/rust-input-source.h

+193
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
#ifndef RUST_INPUT_SOURCE_H
2+
#define RUST_INPUT_SOURCE_H
3+
4+
#include "rust-codepoint.h"
5+
#include "optional.h"
6+
7+
namespace Rust {
8+
// Input source wrapper thing.
9+
class InputSource
10+
{
11+
private:
12+
// position of current character
13+
unsigned int pos;
14+
std::vector<Codepoint> chars;
15+
bool is_valid_utf8;
16+
17+
// Overload operator () to return next char from input stream.
18+
virtual int next_byte () = 0;
19+
20+
Codepoint next_codepoint ()
21+
{
22+
uint32_t input = next_byte ();
23+
24+
if ((int32_t) input == EOF)
25+
return Codepoint::eof ();
26+
else if (input < 128)
27+
{
28+
// ascii -- 1 byte
29+
return {input};
30+
}
31+
else if ((input & 0xC0) == 0x80)
32+
{
33+
// invalid (continuation; can't be first char)
34+
return {0xFFFE};
35+
}
36+
else if ((input & 0xE0) == 0xC0)
37+
{
38+
// 2 bytes
39+
uint8_t input2 = next_byte ();
40+
if ((input2 & 0xC0) != 0x80)
41+
return {0xFFFE};
42+
43+
uint32_t output = ((input & 0x1F) << 6) | ((input2 & 0x3F) << 0);
44+
return output;
45+
}
46+
else if ((input & 0xF0) == 0xE0)
47+
{
48+
// 3 bytes or UTF-8 BOM
49+
uint8_t input2 = next_byte ();
50+
// If the second byte is equal to 0xBB then the input is no longer a
51+
// valid UTF-8 char. Then, we check if the third byte makes up a UTF
52+
// BOM.
53+
if (input == 0xEF && input2 == 0xBB)
54+
{
55+
uint8_t input3 = next_byte ();
56+
if (input3 == 0xBF)
57+
// found BOM
58+
return next_codepoint ();
59+
else
60+
return {0xFFFE};
61+
}
62+
63+
if ((input2 & 0xC0) != 0x80)
64+
return {0xFFFE};
65+
66+
uint8_t input3 = next_byte ();
67+
68+
if ((input3 & 0xC0) != 0x80)
69+
return {0xFFFE};
70+
71+
uint32_t output = ((input & 0x0F) << 12) | ((input2 & 0x3F) << 6)
72+
| ((input3 & 0x3F) << 0);
73+
return {output};
74+
}
75+
else if ((input & 0xF8) == 0xF0)
76+
{
77+
// 4 bytes
78+
uint8_t input2 = next_byte ();
79+
if ((input2 & 0xC0) != 0x80)
80+
return {0xFFFE};
81+
82+
uint8_t input3 = next_byte ();
83+
if ((input3 & 0xC0) != 0x80)
84+
return {0xFFFE};
85+
86+
uint8_t input4 = next_byte ();
87+
if ((input4 & 0xC0) != 0x80)
88+
return {0xFFFE};
89+
90+
uint32_t output = ((input & 0x07) << 18) | ((input2 & 0x3F) << 12)
91+
| ((input3 & 0x3F) << 6) | ((input4 & 0x3F) << 0);
92+
return {output};
93+
}
94+
else
95+
{
96+
return {0xFFFE};
97+
}
98+
}
99+
100+
protected:
101+
// Check if the input source is valid as utf-8 and copy all characters to
102+
// `chars`.
103+
void init ()
104+
{
105+
Codepoint char32 = next_codepoint ();
106+
while (!char32.is_eof () && char32 != 0xFFFE)
107+
{
108+
chars.push_back (char32);
109+
char32 = next_codepoint ();
110+
}
111+
112+
if (char32 == 0xFFFE)
113+
{
114+
// Input source is not valid as utf-8.
115+
is_valid_utf8 = false;
116+
}
117+
}
118+
119+
public:
120+
InputSource () : pos (0), chars ({}), is_valid_utf8 (true) {}
121+
122+
virtual ~InputSource () {}
123+
124+
// Checks if input source is a valid UTF-8 string
125+
bool is_valid () { return is_valid_utf8; }
126+
127+
// get the next UTF-8 character
128+
Codepoint next ()
129+
{
130+
if (pos >= chars.size ())
131+
return Codepoint::eof ();
132+
else
133+
{
134+
Codepoint c = chars[pos];
135+
pos++;
136+
return c;
137+
}
138+
}
139+
140+
// Returns codepoint if input source is a valid UTF-8 string. Returns
141+
// nullopt otherwise.
142+
tl::optional<std::vector<Codepoint>> get_chars ()
143+
{
144+
if (is_valid ())
145+
return {chars};
146+
else
147+
return tl::nullopt;
148+
}
149+
};
150+
151+
class FileInputSource : public InputSource
152+
{
153+
private:
154+
// Input source file.
155+
FILE *input;
156+
157+
int next_byte () override { return fgetc (input); }
158+
159+
public:
160+
// Create new input source from file.
161+
FileInputSource (FILE *input) : InputSource (), input (input)
162+
{
163+
// TODO make this better?
164+
init ();
165+
}
166+
};
167+
168+
class BufferInputSource : public InputSource
169+
{
170+
private:
171+
const std::string &buffer;
172+
size_t offs;
173+
174+
int next_byte () override
175+
{
176+
if (offs >= buffer.size ())
177+
return EOF;
178+
return (uint8_t) buffer.at (offs++);
179+
}
180+
181+
public:
182+
// Create new input source from file.
183+
BufferInputSource (const std::string &b, size_t offset)
184+
: InputSource (), buffer (b), offs (offset)
185+
{
186+
// TODO make this better?
187+
init ();
188+
}
189+
};
190+
191+
} // namespace Rust
192+
193+
#endif

gcc/rust/lex/rust-lex.cc

+3-4
Original file line numberDiff line numberDiff line change
@@ -2534,8 +2534,7 @@ namespace selftest {
25342534

25352535
// Checks if `src` has the same contents as the given characters
25362536
void
2537-
assert_source_content (Rust::Lexer::InputSource &src,
2538-
std::vector<uint32_t> expected)
2537+
assert_source_content (Rust::InputSource &src, std::vector<uint32_t> expected)
25392538
{
25402539
Rust::Codepoint src_char = src.next ();
25412540
for (auto expected_char : expected)
@@ -2553,7 +2552,7 @@ assert_source_content (Rust::Lexer::InputSource &src,
25532552
void
25542553
test_buffer_input_source (std::string str, std::vector<uint32_t> expected)
25552554
{
2556-
Rust::Lexer::BufferInputSource source (str, 0);
2555+
Rust::BufferInputSource source (str, 0);
25572556
assert_source_content (source, expected);
25582557
}
25592558

@@ -2564,7 +2563,7 @@ test_file_input_source (std::string str, std::vector<uint32_t> expected)
25642563
// Moves to the first character
25652564
fputs (str.c_str (), tmpf);
25662565
std::rewind (tmpf);
2567-
Rust::Lexer::FileInputSource source (tmpf);
2566+
Rust::FileInputSource source (tmpf);
25682567
assert_source_content (source, expected);
25692568
}
25702569

0 commit comments

Comments
 (0)