Commit 9d85335

Merge pull request #12427 from r00ster91/nicelexer
tokenizer: cleanups

2 parents 6e79493 + 8390965

1 file changed

lib/std/zig/tokenizer.zig

Lines changed: 39 additions & 43 deletions
@@ -1,5 +1,4 @@
 const std = @import("../std.zig");
-const mem = std.mem;
 
 pub const Token = struct {
     tag: Tag,
@@ -350,7 +349,7 @@ pub const Tokenizer = struct {
 
     pub fn init(buffer: [:0]const u8) Tokenizer {
         // Skip the UTF-8 BOM if present
-        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
+        const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
         return Tokenizer{
             .buffer = buffer,
             .index = src_start,
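
A note on this hunk: giving `src_start` an explicit `usize` type lets both branches of the `if` coerce to it, so the old `@as(usize, 0)` cast is no longer needed. A minimal standalone sketch of the same BOM-skip logic (not part of the diff; `std.mem.startsWith` is the only API involved):

```zig
const std = @import("std");

test "UTF-8 BOM is skipped before tokenizing" {
    const with_bom = "\xEF\xBB\xBFa;\n";
    // startsWith reports whether the buffer begins with the
    // three-byte UTF-8 byte-order mark.
    const src_start: usize = if (std.mem.startsWith(u8, with_bom, "\xEF\xBB\xBF")) 3 else 0;
    try std.testing.expectEqual(@as(usize, 3), src_start);
    try std.testing.expectEqualStrings("a;\n", with_bom[src_start..]);
}
```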
@@ -1433,8 +1432,8 @@ pub const Tokenizer = struct {
 
     fn getInvalidCharacterLength(self: *Tokenizer) u3 {
         const c0 = self.buffer[self.index];
-        if (c0 < 0x80) {
-            if (c0 < 0x20 or c0 == 0x7f) {
+        if (std.ascii.isASCII(c0)) {
+            if (std.ascii.isCntrl(c0)) {
                 // ascii control codes are never allowed
                 // (note that \n was checked before we got here)
                 return 1;
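
A note on this hunk: the named `std.ascii` helpers replace magic numbers without changing behavior, since `isASCII(c)` is `c < 0x80` and `isCntrl(c)` is true exactly for the C0 control bytes (0x00–0x1F) plus DEL (0x7F). A standalone sketch that checks the equivalence exhaustively (helper names as of this commit; later Zig releases renamed them to `isAscii` and `isControl`):

```zig
const std = @import("std");

test "ascii helpers match the old magic-number predicates" {
    var c: u8 = 0;
    while (true) : (c += 1) {
        try std.testing.expectEqual(c < 0x80, std.ascii.isASCII(c));
        try std.testing.expectEqual(c < 0x20 or c == 0x7f, std.ascii.isCntrl(c));
        if (c == 0xff) break; // stop before the u8 increment overflows
    }
}
```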
@@ -1469,8 +1468,8 @@
     }
 };
 
-test "tokenizer" {
-    try testTokenize("test", &.{.keyword_test});
+test "keywords" {
+    try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
 }
 
 test "line comment followed by top-level comptime" {
@@ -1485,7 +1484,7 @@ test "line comment followed by top-level comptime" {
     });
 }
 
-test "tokenizer - unknown length pointer and then c pointer" {
+test "unknown length pointer and then c pointer" {
     try testTokenize(
         \\[*]u8
         \\[*c]u8
@@ -1502,7 +1501,7 @@ test "tokenizer - unknown length pointer and then c pointer" {
     });
 }
 
-test "tokenizer - code point literal with hex escape" {
+test "code point literal with hex escape" {
     try testTokenize(
         \\'\x1b'
     , &.{.char_literal});
@@ -1511,21 +1510,21 @@ test "tokenizer - code point literal with hex escape" {
     , &.{ .invalid, .invalid });
 }
 
-test "tokenizer - newline in char literal" {
+test "newline in char literal" {
     try testTokenize(
         \\'
         \\'
     , &.{ .invalid, .invalid });
 }
 
-test "tokenizer - newline in string literal" {
+test "newline in string literal" {
     try testTokenize(
         \\"
         \\"
     , &.{ .invalid, .string_literal });
 }
 
-test "tokenizer - code point literal with unicode escapes" {
+test "code point literal with unicode escapes" {
     // Valid unicode escapes
     try testTokenize(
         \\'\u{3}'
@@ -1575,13 +1574,13 @@ test "tokenizer - code point literal with unicode escapes" {
     , &.{ .invalid, .integer_literal, .invalid });
 }
 
-test "tokenizer - code point literal with unicode code point" {
+test "code point literal with unicode code point" {
     try testTokenize(
         \\'💩'
     , &.{.char_literal});
 }
 
-test "tokenizer - float literal e exponent" {
+test "float literal e exponent" {
     try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
         .identifier,
         .equal,
@@ -1590,7 +1589,7 @@ test "tokenizer - float literal e exponent" {
     });
 }
 
-test "tokenizer - float literal p exponent" {
+test "float literal p exponent" {
     try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
         .identifier,
         .equal,
@@ -1599,19 +1598,19 @@ test "tokenizer - float literal p exponent" {
     });
 }
 
-test "tokenizer - chars" {
+test "chars" {
     try testTokenize("'c'", &.{.char_literal});
 }
 
-test "tokenizer - invalid token characters" {
+test "invalid token characters" {
     try testTokenize("#", &.{.invalid});
     try testTokenize("`", &.{.invalid});
     try testTokenize("'c", &.{.invalid});
     try testTokenize("'", &.{.invalid});
     try testTokenize("''", &.{ .invalid, .invalid });
 }
 
-test "tokenizer - invalid literal/comment characters" {
+test "invalid literal/comment characters" {
     try testTokenize("\"\x00\"", &.{
         .string_literal,
         .invalid,
@@ -1627,12 +1626,12 @@ test "tokenizer - invalid literal/comment characters" {
     });
 }
 
-test "tokenizer - utf8" {
+test "utf8" {
     try testTokenize("//\xc2\x80", &.{});
     try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
 }
 
-test "tokenizer - invalid utf8" {
+test "invalid utf8" {
     try testTokenize("//\x80", &.{
         .invalid,
     });
@@ -1659,7 +1658,7 @@ test "tokenizer - invalid utf8" {
     });
 }
 
-test "tokenizer - illegal unicode codepoints" {
+test "illegal unicode codepoints" {
     // unicode newline characters: U+0085, U+2028, U+2029
     try testTokenize("//\xc2\x84", &.{});
     try testTokenize("//\xc2\x85", &.{
@@ -1676,7 +1675,7 @@ test "tokenizer - illegal unicode codepoints" {
     try testTokenize("//\xe2\x80\xaa", &.{});
 }
 
-test "tokenizer - string identifier and builtin fns" {
+test "string identifier and builtin fns" {
     try testTokenize(
         \\const @"if" = @import("std");
     , &.{
@@ -1691,15 +1690,15 @@ test "tokenizer - string identifier and builtin fns" {
     });
 }
 
-test "tokenizer - multiline string literal with literal tab" {
+test "multiline string literal with literal tab" {
     try testTokenize(
         \\\\foo	bar
     , &.{
         .multiline_string_literal_line,
     });
 }
 
-test "tokenizer - comments with literal tab" {
+test "comments with literal tab" {
     try testTokenize(
         \\//foo	bar
         \\//!foo	bar
@@ -1715,14 +1714,14 @@ test "tokenizer - comments with literal tab" {
     });
 }
 
-test "tokenizer - pipe and then invalid" {
+test "pipe and then invalid" {
     try testTokenize("||=", &.{
         .pipe_pipe,
         .equal,
     });
 }
 
-test "tokenizer - line comment and doc comment" {
+test "line comment and doc comment" {
     try testTokenize("//", &.{});
     try testTokenize("// a / b", &.{});
     try testTokenize("// /", &.{});
@@ -1733,7 +1732,7 @@ test "tokenizer - line comment and doc comment" {
     try testTokenize("//!!", &.{.container_doc_comment});
 }
 
-test "tokenizer - line comment followed by identifier" {
+test "line comment followed by identifier" {
     try testTokenize(
         \\ Unexpected,
         \\ // another
@@ -1746,7 +1745,7 @@ test "tokenizer - line comment followed by identifier" {
     });
 }
 
-test "tokenizer - UTF-8 BOM is recognized and skipped" {
+test "UTF-8 BOM is recognized and skipped" {
     try testTokenize("\xEF\xBB\xBFa;\n", &.{
         .identifier,
         .semicolon,
@@ -1788,15 +1787,15 @@ test "correctly parse pointer dereference followed by asterisk" {
     });
 }
 
-test "tokenizer - range literals" {
+test "range literals" {
     try testTokenize("0...9", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
     try testTokenize("0x00...0x09", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("0b00...0b11", &.{ .integer_literal, .ellipsis3, .integer_literal });
     try testTokenize("0o00...0o11", &.{ .integer_literal, .ellipsis3, .integer_literal });
 }
 
-test "tokenizer - number literals decimal" {
+test "number literals decimal" {
     try testTokenize("0", &.{.integer_literal});
     try testTokenize("1", &.{.integer_literal});
     try testTokenize("2", &.{.integer_literal});
@@ -1863,7 +1862,7 @@ test "tokenizer - number literals decimal" {
     try testTokenize("1.0e0_+", &.{ .invalid, .plus });
 }
 
-test "tokenizer - number literals binary" {
+test "number literals binary" {
     try testTokenize("0b0", &.{.integer_literal});
     try testTokenize("0b1", &.{.integer_literal});
     try testTokenize("0b2", &.{ .invalid, .integer_literal });
@@ -1902,7 +1901,7 @@ test "tokenizer - number literals binary" {
     try testTokenize("0b1_,", &.{ .invalid, .comma });
 }
 
-test "tokenizer - number literals octal" {
+test "number literals octal" {
     try testTokenize("0o0", &.{.integer_literal});
     try testTokenize("0o1", &.{.integer_literal});
     try testTokenize("0o2", &.{.integer_literal});
@@ -1941,7 +1940,7 @@ test "tokenizer - number literals octal" {
     try testTokenize("0o_,", &.{ .invalid, .identifier, .comma });
 }
 
-test "tokenizer - number literals hexadecimal" {
+test "number literals hexadecimal" {
     try testTokenize("0x0", &.{.integer_literal});
     try testTokenize("0x1", &.{.integer_literal});
     try testTokenize("0x2", &.{.integer_literal});
@@ -2029,22 +2028,22 @@ test "tokenizer - number literals hexadecimal" {
     try testTokenize("0x0.0p0_", &.{ .invalid, .eof });
 }
 
-test "tokenizer - multi line string literal with only 1 backslash" {
+test "multi line string literal with only 1 backslash" {
     try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
 }
 
-test "tokenizer - invalid builtin identifiers" {
+test "invalid builtin identifiers" {
     try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
     try testTokenize("@0()", &.{ .invalid, .integer_literal, .l_paren, .r_paren });
 }
 
-test "tokenizer - invalid token with unfinished escape right before eof" {
+test "invalid token with unfinished escape right before eof" {
     try testTokenize("\"\\", &.{.invalid});
     try testTokenize("'\\", &.{.invalid});
     try testTokenize("'\\u", &.{.invalid});
 }
 
-test "tokenizer - saturating" {
+test "saturating operators" {
     try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
     try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
     try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
@@ -2062,17 +2061,14 @@ test "tokenizer - saturating" {
     try testTokenize("-|=", &.{.minus_pipe_equal});
 }
 
-fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
     var tokenizer = Tokenizer.init(source);
-    for (expected_tokens) |expected_token_id| {
+    for (expected_token_tags) |expected_token_tag| {
         const token = tokenizer.next();
-        if (token.tag != expected_token_id) {
-            std.debug.panic("expected {s}, found {s}\n", .{
-                @tagName(expected_token_id), @tagName(token.tag),
-            });
-        }
+        try std.testing.expectEqual(expected_token_tag, token.tag);
     }
     const last_token = tokenizer.next();
     try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
     try std.testing.expectEqual(source.len, last_token.loc.start);
+    try std.testing.expectEqual(source.len, last_token.loc.end);
 }
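
A note on this final hunk: `std.testing.expectEqual` fails the test with `error.TestExpectedEqual` and prints both values, so the hand-rolled `std.debug.panic` message is redundant, and the new last assertion additionally pins the `eof` token's `loc.end` to `source.len`. A standalone sketch of the property being asserted (assumes this file's `std`, `Tokenizer`, and `Token` are in scope):

```zig
// After the last real token, next() returns an eof token whose
// loc.start and loc.end both equal source.len.
test "eof token location is pinned to source.len" {
    const source: [:0]const u8 = "a;";
    var tokenizer = Tokenizer.init(source);
    _ = tokenizer.next(); // .identifier
    _ = tokenizer.next(); // .semicolon
    const eof_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, eof_token.tag);
    try std.testing.expectEqual(source.len, eof_token.loc.start);
    try std.testing.expectEqual(source.len, eof_token.loc.end);
}
```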
