@@ -1,5 +1,4 @@
const std = @import("../std.zig");
-const mem = std.mem;

pub const Token = struct {
    tag: Tag,
@@ -350,7 +349,7 @@ pub const Tokenizer = struct {

    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present
-        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
+        const src_start: usize = if (std.mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else 0;
        return Tokenizer{
            .buffer = buffer,
            .index = src_start,
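
The new `: usize` annotation lets the bare literals `3` and `0` coerce to the result type; without it (or the old `@as(usize, 0)`), peer type resolution of two `comptime_int`s under a runtime condition would not compile. A minimal sketch of the resulting behavior (illustrative only, not part of the diff):

    // A leading UTF-8 BOM ("\xEF\xBB\xBF") is excluded from tokenization,
    // so scanning starts at byte offset 3 in BOM-prefixed source.
    var tokenizer = Tokenizer.init("\xEF\xBB\xBFconst x = 1;");
    // tokenizer.index == 3; the first token returned is .keyword_const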
@@ -1433,8 +1432,8 @@ pub const Tokenizer = struct {

    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
        const c0 = self.buffer[self.index];
-        if (c0 < 0x80) {
-            if (c0 < 0x20 or c0 == 0x7f) {
+        if (std.ascii.isASCII(c0)) {
+            if (std.ascii.isCntrl(c0)) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
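
Here `std.ascii.isASCII` and `std.ascii.isCntrl` (the `std.ascii` names at the time of this change) encode the same ranges the magic numbers did; a sketch of the assumed equivalence:

    // isASCII(c) <=> c < 0x80
    // isCntrl(c) <=> c < 0x20 or c == 0x7f (C0 control codes plus DEL)
    std.debug.assert(std.ascii.isCntrl(0x00) and std.ascii.isCntrl(0x7f));
    std.debug.assert(std.ascii.isASCII('a') and !std.ascii.isCntrl('a'));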
@@ -1469,8 +1468,8 @@ pub const Tokenizer = struct {
    }
};

-test "tokenizer" {
-    try testTokenize("test", &.{.keyword_test});
+test "keywords" {
+    try testTokenize("test const else", &.{ .keyword_test, .keyword_const, .keyword_else });
}

test "line comment followed by top-level comptime" {
@@ -1485,7 +1484,7 @@ test "line comment followed by top-level comptime" {
    });
}

-test "tokenizer - unknown length pointer and then c pointer" {
+test "unknown length pointer and then c pointer" {
    try testTokenize(
        \\[*]u8
        \\[*c]u8
@@ -1502,7 +1501,7 @@ test "tokenizer - unknown length pointer and then c pointer" {
    });
}

-test "tokenizer - code point literal with hex escape" {
+test "code point literal with hex escape" {
    try testTokenize(
        \\'\x1b'
    , &.{.char_literal});
@@ -1511,21 +1510,21 @@ test "tokenizer - code point literal with hex escape" {
    , &.{ .invalid, .invalid });
}

-test "tokenizer - newline in char literal" {
+test "newline in char literal" {
    try testTokenize(
        \\'
        \\'
    , &.{ .invalid, .invalid });
}

-test "tokenizer - newline in string literal" {
+test "newline in string literal" {
    try testTokenize(
        \\"
        \\"
    , &.{ .invalid, .string_literal });
}

-test "tokenizer - code point literal with unicode escapes" {
+test "code point literal with unicode escapes" {
    // Valid unicode escapes
    try testTokenize(
        \\'\u{3}'
@@ -1575,13 +1574,13 @@ test "tokenizer - code point literal with unicode escapes" {
    , &.{ .invalid, .integer_literal, .invalid });
}

-test "tokenizer - code point literal with unicode code point" {
+test "code point literal with unicode code point" {
    try testTokenize(
        \\'💩'
    , &.{.char_literal});
}

-test "tokenizer - float literal e exponent" {
+test "float literal e exponent" {
    try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
        .identifier,
        .equal,
@@ -1590,7 +1589,7 @@ test "tokenizer - float literal e exponent" {
    });
}

-test "tokenizer - float literal p exponent" {
+test "float literal p exponent" {
    try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
        .identifier,
        .equal,
@@ -1599,19 +1598,19 @@ test "tokenizer - float literal p exponent" {
    });
}

-test "tokenizer - chars" {
+test "chars" {
    try testTokenize("'c'", &.{.char_literal});
}

-test "tokenizer - invalid token characters" {
+test "invalid token characters" {
    try testTokenize("#", &.{.invalid});
    try testTokenize("`", &.{.invalid});
    try testTokenize("'c", &.{.invalid});
    try testTokenize("'", &.{.invalid});
    try testTokenize("''", &.{ .invalid, .invalid });
}

-test "tokenizer - invalid literal/comment characters" {
+test "invalid literal/comment characters" {
    try testTokenize("\"\x00\"", &.{
        .string_literal,
        .invalid,
@@ -1627,12 +1626,12 @@ test "tokenizer - invalid literal/comment characters" {
    });
}

-test "tokenizer - utf8" {
+test "utf8" {
    try testTokenize("//\xc2\x80", &.{});
    try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
}

-test "tokenizer - invalid utf8" {
+test "invalid utf8" {
    try testTokenize("//\x80", &.{
        .invalid,
    });
@@ -1659,7 +1658,7 @@ test "tokenizer - invalid utf8" {
    });
}

-test "tokenizer - illegal unicode codepoints" {
+test "illegal unicode codepoints" {
    // unicode newline characters.U+0085, U+2028, U+2029
    try testTokenize("//\xc2\x84", &.{});
    try testTokenize("//\xc2\x85", &.{
@@ -1676,7 +1675,7 @@ test "tokenizer - illegal unicode codepoints" {
    try testTokenize("//\xe2\x80\xaa", &.{});
}

-test "tokenizer - string identifier and builtin fns" {
+test "string identifier and builtin fns" {
    try testTokenize(
        \\const @"if" = @import("std");
    , &.{
@@ -1691,15 +1690,15 @@ test "tokenizer - string identifier and builtin fns" {
    });
}

-test "tokenizer - multiline string literal with literal tab" {
+test "multiline string literal with literal tab" {
    try testTokenize(
        \\\\foo	bar
    , &.{
        .multiline_string_literal_line,
    });
}

-test "tokenizer - comments with literal tab" {
+test "comments with literal tab" {
    try testTokenize(
        \\//foo	bar
        \\//!foo	bar
@@ -1715,14 +1714,14 @@ test "tokenizer - comments with literal tab" {
    });
}

-test "tokenizer - pipe and then invalid" {
+test "pipe and then invalid" {
    try testTokenize("||=", &.{
        .pipe_pipe,
        .equal,
    });
}

-test "tokenizer - line comment and doc comment" {
+test "line comment and doc comment" {
    try testTokenize("//", &.{});
    try testTokenize("// a / b", &.{});
    try testTokenize("// /", &.{});
@@ -1733,7 +1732,7 @@ test "tokenizer - line comment and doc comment" {
    try testTokenize("//!!", &.{.container_doc_comment});
}

-test "tokenizer - line comment followed by identifier" {
+test "line comment followed by identifier" {
    try testTokenize(
        \\ Unexpected,
        \\ // another
@@ -1746,7 +1745,7 @@ test "tokenizer - line comment followed by identifier" {
    });
}

-test "tokenizer - UTF-8 BOM is recognized and skipped" {
+test "UTF-8 BOM is recognized and skipped" {
    try testTokenize("\xEF\xBB\xBFa;\n", &.{
        .identifier,
        .semicolon,
@@ -1788,15 +1787,15 @@ test "correctly parse pointer dereference followed by asterisk" {
    });
}

-test "tokenizer - range literals" {
+test "range literals" {
    try testTokenize("0...9", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
    try testTokenize("0x00...0x09", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("0b00...0b11", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("0o00...0o11", &.{ .integer_literal, .ellipsis3, .integer_literal });
}

-test "tokenizer - number literals decimal" {
+test "number literals decimal" {
    try testTokenize("0", &.{.integer_literal});
    try testTokenize("1", &.{.integer_literal});
    try testTokenize("2", &.{.integer_literal});
@@ -1863,7 +1862,7 @@ test "tokenizer - number literals decimal" {
    try testTokenize("1.0e0_+", &.{ .invalid, .plus });
}

-test "tokenizer - number literals binary" {
+test "number literals binary" {
    try testTokenize("0b0", &.{.integer_literal});
    try testTokenize("0b1", &.{.integer_literal});
    try testTokenize("0b2", &.{ .invalid, .integer_literal });
@@ -1902,7 +1901,7 @@ test "tokenizer - number literals binary" {
    try testTokenize("0b1_,", &.{ .invalid, .comma });
}

-test "tokenizer - number literals octal" {
+test "number literals octal" {
    try testTokenize("0o0", &.{.integer_literal});
    try testTokenize("0o1", &.{.integer_literal});
    try testTokenize("0o2", &.{.integer_literal});
@@ -1941,7 +1940,7 @@ test "tokenizer - number literals octal" {
    try testTokenize("0o_,", &.{ .invalid, .identifier, .comma });
}

-test "tokenizer - number literals hexadecimal" {
+test "number literals hexadecimal" {
    try testTokenize("0x0", &.{.integer_literal});
    try testTokenize("0x1", &.{.integer_literal});
    try testTokenize("0x2", &.{.integer_literal});
@@ -2029,22 +2028,22 @@ test "tokenizer - number literals hexadecimal" {
    try testTokenize("0x0.0p0_", &.{ .invalid, .eof });
}

-test "tokenizer - multi line string literal with only 1 backslash" {
+test "multi line string literal with only 1 backslash" {
    try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
}

-test "tokenizer - invalid builtin identifiers" {
+test "invalid builtin identifiers" {
    try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
    try testTokenize("@0()", &.{ .invalid, .integer_literal, .l_paren, .r_paren });
}

-test "tokenizer - invalid token with unfinished escape right before eof" {
+test "invalid token with unfinished escape right before eof" {
    try testTokenize("\"\\", &.{.invalid});
    try testTokenize("'\\", &.{.invalid});
    try testTokenize("'\\u", &.{.invalid});
}

-test "tokenizer - saturating" {
+test "saturating operators" {
    try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
    try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
    try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});
@@ -2062,17 +2061,14 @@ test "tokenizer - saturating" {
    try testTokenize("-|=", &.{.minus_pipe_equal});
}
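
The `<<|` and `-|` tokens exercised here belong to Zig's saturating operators (shift-left and subtraction that clamp at the type's bounds instead of overflowing); a quick sketch of the assumed semantics, separate from the tokenizer change itself:

    comptime {
        std.debug.assert(@as(u8, 200) <<| 1 == 255); // 400 clamps to maxInt(u8)
        std.debug.assert(@as(u8, 0) -| 1 == 0); // no underflow below zero
    }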

-fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
+fn testTokenize(source: [:0]const u8, expected_token_tags: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
-    for (expected_tokens) |expected_token_id| {
+    for (expected_token_tags) |expected_token_tag| {
        const token = tokenizer.next();
-        if (token.tag != expected_token_id) {
-            std.debug.panic("expected {s}, found {s}\n", .{
-                @tagName(expected_token_id), @tagName(token.tag),
-            });
-        }
+        try std.testing.expectEqual(expected_token_tag, token.tag);
    }
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
+    try std.testing.expectEqual(source.len, last_token.loc.end);
}
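
Replacing the hand-rolled panic with `std.testing.expectEqual` makes a tag mismatch fail the test with the expected and actual values reported instead of aborting the whole run, and the added `loc.end` check pins the trailing `.eof` token to an empty span at `source.len`. Call sites are unchanged; a usage sketch (illustrative):

    test "usage sketch" {
        // Tags must match in order; the stream must then end with an .eof
        // token whose loc.start == loc.end == source.len.
        try testTokenize("const x", &.{ .keyword_const, .identifier });
    }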