Skip to content

Commit 2933a82

Browse files
hryx authored and andrewrk committed
json: disallow overlong and out-of-range UTF-8
Fixes #2379

= Overlong (non-shortest) sequences

UTF-8's unique encoding scheme allows for some Unicode codepoints to be represented in multiple ways. For any of these characters, the spec forbids all but the shortest form. These disallowed longer sequences are called "overlong". As an interesting side effect of this rule, the bytes C0 and C1 never appear in valid UTF-8.

= Codepoint range

UTF-8 disallows representation of codepoints beyond U+10FFFF, which is the highest codepoint that can be encoded in UTF-16. Because a 4-byte sequence is capable of resulting in such codepoints, they must be explicitly rejected. This rule also has an interesting side effect, which is that bytes F5 to FF never appear.

= References

Detecting an overlong version of a codepoint could get gnarly, but luckily the Unicode Consortium did the hard work by creating this handy table of valid byte sequences: https://unicode.org/versions/corrigendum1.html

I thought this mapped nicely to the parser's state machine, so I rearranged the relevant states to make use of it.
1 parent 9390e8b commit 2933a82

File tree

2 files changed

+123
-19
lines changed

2 files changed

+123
-19
lines changed

lib/std/json.zig

Lines changed: 57 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,8 @@ pub const StreamingParser = struct {
8787
string_last_was_high_surrogate: bool,
8888
// Used inside of StringEscapeHexUnicode* states
8989
string_unicode_codepoint: u21,
90+
// The first byte needs to be stored to validate 3- and 4-byte sequences.
91+
sequence_first_byte: u8 = undefined,
9092
// When in .Number states, is the number a (still) valid integer?
9193
number_is_integer: bool,
9294

@@ -132,9 +134,12 @@ pub const StreamingParser = struct {
132134
ValueBeginNoClosing,
133135

134136
String,
135-
StringUtf8Byte3,
136-
StringUtf8Byte2,
137-
StringUtf8Byte1,
137+
StringUtf8Byte2Of2,
138+
StringUtf8Byte2Of3,
139+
StringUtf8Byte3Of3,
140+
StringUtf8Byte2Of4,
141+
StringUtf8Byte3Of4,
142+
StringUtf8Byte4Of4,
138143
StringEscapeCharacter,
139144
StringEscapeHexUnicode4,
140145
StringEscapeHexUnicode3,
@@ -581,35 +586,68 @@ pub const StreamingParser = struct {
581586
// non-control ascii
582587
p.string_last_was_high_surrogate = false;
583588
},
584-
0xC0...0xDF => {
585-
p.state = .StringUtf8Byte1;
589+
0xC2...0xDF => {
590+
p.state = .StringUtf8Byte2Of2;
586591
},
587592
0xE0...0xEF => {
588-
p.state = .StringUtf8Byte2;
593+
p.state = .StringUtf8Byte2Of3;
594+
p.sequence_first_byte = c;
589595
},
590-
0xF0...0xFF => {
591-
p.state = .StringUtf8Byte3;
596+
0xF0...0xF4 => {
597+
p.state = .StringUtf8Byte2Of4;
598+
p.sequence_first_byte = c;
592599
},
593600
else => {
594601
return error.InvalidUtf8Byte;
595602
},
596603
},
597604

598-
.StringUtf8Byte3 => switch (c >> 6) {
599-
0b10 => p.state = .StringUtf8Byte2,
605+
.StringUtf8Byte2Of2 => switch (c >> 6) {
606+
0b10 => p.state = .String,
600607
else => return error.InvalidUtf8Byte,
601608
},
602-
603-
.StringUtf8Byte2 => switch (c >> 6) {
604-
0b10 => p.state = .StringUtf8Byte1,
609+
.StringUtf8Byte2Of3 => {
610+
switch (p.sequence_first_byte) {
611+
0xE0 => switch (c) {
612+
0xA0...0xBF => {},
613+
else => return error.InvalidUtf8Byte,
614+
},
615+
0xE1...0xEF => switch (c) {
616+
0x80...0xBF => {},
617+
else => return error.InvalidUtf8Byte,
618+
},
619+
else => return error.InvalidUtf8Byte,
620+
}
621+
p.state = .StringUtf8Byte3Of3;
622+
},
623+
.StringUtf8Byte3Of3 => switch (c) {
624+
0x80...0xBF => p.state = .String,
605625
else => return error.InvalidUtf8Byte,
606626
},
607-
608-
.StringUtf8Byte1 => switch (c >> 6) {
609-
0b10 => {
610-
p.state = .String;
611-
p.string_last_was_high_surrogate = false;
612-
},
627+
.StringUtf8Byte2Of4 => {
628+
switch (p.sequence_first_byte) {
629+
0xF0 => switch (c) {
630+
0x90...0xBF => {},
631+
else => return error.InvalidUtf8Byte,
632+
},
633+
0xF1...0xF3 => switch (c) {
634+
0x80...0xBF => {},
635+
else => return error.InvalidUtf8Byte,
636+
},
637+
0xF4 => switch (c) {
638+
0x80...0x8F => {},
639+
else => return error.InvalidUtf8Byte,
640+
},
641+
else => return error.InvalidUtf8Byte,
642+
}
643+
p.state = .StringUtf8Byte3Of4;
644+
},
645+
.StringUtf8Byte3Of4 => switch (c) {
646+
0x80...0xBF => p.state = .StringUtf8Byte4Of4,
647+
else => return error.InvalidUtf8Byte,
648+
},
649+
.StringUtf8Byte4Of4 => switch (c) {
650+
0x80...0xBF => p.state = .String,
613651
else => return error.InvalidUtf8Byte,
614652
},
615653

lib/std/json/test.zig

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,20 @@ fn err(comptime s: []const u8) void {
2727
} else |_| {}
2828
}
2929

30+
/// Test helper: asserts that `std.json.validate(s)` returns false and that
/// `std.json.Parser.parse(s)` fails with exactly `error.InvalidUtf8Byte`.
fn utf8Error(comptime s: []const u8) void {
31+
std.testing.expect(!std.json.validate(s));
32+
33+
// The parser allocates from this fixed 20 KiB buffer via FixedBufferAllocator.
var mem_buffer: [1024 * 20]u8 = undefined;
34+
const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
35+
var p = std.json.Parser.init(allocator, false);
36+
37+
if (p.parse(s)) |_| {
38+
// A successful parse means the invalid input was accepted; the helper treats that as impossible.
unreachable;
39+
} else |e| {
40+
// Any other error kind would mean the input was rejected for the wrong reason.
std.testing.expect(e == error.InvalidUtf8Byte);
41+
}
42+
}
43+
3044
fn any(comptime s: []const u8) void {
3145
_ = std.json.validate(s);
3246

@@ -1936,3 +1950,55 @@ test "i_structure_UTF-8_BOM_empty_object" {
19361950
\\{}
19371951
);
19381952
}
1953+
1954+
// Each input starts a multi-byte sequence whose bytes so far are accepted by
// the parser, then reaches the closing quote before the sequence is complete,
// so the error must come from the truncation itself.
test "truncated UTF-8 sequence" {
1955+
utf8Error("\"\xc2\"");
1956+
utf8Error("\"\xdf\"");
1957+
utf8Error("\"\xed\xa0\"");
1958+
// After lead byte F0 the parser only admits 0x90...0xBF as the second byte;
// using 0x80 here would be rejected as overlong before any truncation is seen.
utf8Error("\"\xf0\x90\"");
1959+
utf8Error("\"\xf0\x90\x80\"");
1960+
}
1961+
1962+
// Each input has a valid lead byte followed by a byte outside the continuation
// range at one specific position; all bytes before that position are accepted.
test "invalid continuation byte" {
1963+
// 2-byte sequences: bad second byte.
utf8Error("\"\xc2\x00\"");
1964+
utf8Error("\"\xc2\x7f\"");
1965+
utf8Error("\"\xc2\xc0\"");
1966+
utf8Error("\"\xc3\xc1\"");
1967+
utf8Error("\"\xc4\xf5\"");
1968+
utf8Error("\"\xc5\xff\"");
1969+
// 3-byte sequences: bad third byte.
utf8Error("\"\xe4\x80\x00\"");
1970+
utf8Error("\"\xe5\x80\x10\"");
1971+
utf8Error("\"\xe6\x80\xc0\"");
1972+
utf8Error("\"\xe7\x80\xf5\"");
1973+
// 3-byte sequence: bad second byte.
utf8Error("\"\xe8\x00\x80\"");
1974+
// 4-byte sequence: bad second byte.
utf8Error("\"\xf2\x00\x80\x80\"");
1975+
// 4-byte sequences: bad third byte. The second byte after F0 must be in
// 0x90...0xBF, otherwise the overlong check fires before the third byte is read.
utf8Error("\"\xf0\x90\x00\x80\"");
1976+
utf8Error("\"\xf1\x80\xc0\x80\"");
1977+
// 4-byte sequences: bad fourth byte.
utf8Error("\"\xf2\x80\x80\x00\"");
1978+
utf8Error("\"\xf3\x80\x80\xc0\"");
1979+
utf8Error("\"\xf4\x80\x80\xf5\"");
1980+
}
1981+
1982+
// Non-shortest encodings must be rejected with error.InvalidUtf8Byte.
test "disallowed overlong form" {
1983+
// C0 and C1 are not accepted as lead bytes at all (2-byte leads start at C2).
utf8Error("\"\xc0\x80\"");
1984+
utf8Error("\"\xc0\x90\"");
1985+
utf8Error("\"\xc1\x80\"");
1986+
utf8Error("\"\xc1\x90\"");
1987+
// After E0 the second byte must be in 0xA0...0xBF.
utf8Error("\"\xe0\x80\x80\"");
1988+
// After F0 the second byte must be in 0x90...0xBF.
utf8Error("\"\xf0\x80\x80\x80\"");
1989+
}
1990+
1991+
// Sequences that would encode codepoints beyond U+10FFFF must be rejected
// with error.InvalidUtf8Byte.
test "out of UTF-16 range" {
1992+
// After F4 the second byte must be in 0x80...0x8F; 0x90 is past the limit.
utf8Error("\"\xf4\x90\x80\x80\"");
1993+
// F5 through FF are not accepted as lead bytes.
utf8Error("\"\xf5\x80\x80\x80\"");
1994+
utf8Error("\"\xf6\x80\x80\x80\"");
1995+
utf8Error("\"\xf7\x80\x80\x80\"");
1996+
utf8Error("\"\xf8\x80\x80\x80\"");
1997+
utf8Error("\"\xf9\x80\x80\x80\"");
1998+
utf8Error("\"\xfa\x80\x80\x80\"");
1999+
utf8Error("\"\xfb\x80\x80\x80\"");
2000+
utf8Error("\"\xfc\x80\x80\x80\"");
2001+
utf8Error("\"\xfd\x80\x80\x80\"");
2002+
utf8Error("\"\xfe\x80\x80\x80\"");
2003+
utf8Error("\"\xff\x80\x80\x80\"");
2004+
}

0 commit comments

Comments
 (0)