diff --git a/lib/std/json.zig b/lib/std/json.zig
index a7e98ad1a5ab..481ca7dd954f 100644
--- a/lib/std/json.zig
+++ b/lib/std/json.zig
@@ -87,6 +87,10 @@ pub const StreamingParser = struct {
     string_last_was_high_surrogate: bool,
     // Used inside of StringEscapeHexUnicode* states
     string_unicode_codepoint: u21,
+    // Lead byte of the UTF-8 sequence currently being validated. Only set and
+    // read for 3- and 4-byte sequences, where it decides which range the
+    // second byte must fall in (see .StringUtf8Byte2Of3 / .StringUtf8Byte2Of4).
+    sequence_first_byte: u8 = undefined,
     // When in .Number states, is the number a (still) valid integer?
     number_is_integer: bool,
 
@@ -132,9 +136,13 @@ pub const StreamingParser = struct {
         ValueBeginNoClosing,
 
         String,
-        StringUtf8Byte3,
-        StringUtf8Byte2,
-        StringUtf8Byte1,
+        // StringUtf8Byte<i>Of<n>: expecting byte i of an n-byte UTF-8 sequence.
+        StringUtf8Byte2Of2,
+        StringUtf8Byte2Of3,
+        StringUtf8Byte3Of3,
+        StringUtf8Byte2Of4,
+        StringUtf8Byte3Of4,
+        StringUtf8Byte4Of4,
         StringEscapeCharacter,
         StringEscapeHexUnicode4,
         StringEscapeHexUnicode3,
@@ -581,35 +589,90 @@ pub const StreamingParser = struct {
                     // non-control ascii
                     p.string_last_was_high_surrogate = false;
                 },
-                0xC0...0xDF => {
-                    p.state = .StringUtf8Byte1;
+                // 0xC0 and 0xC1 could only start overlong 2-byte forms.
+                0xC2...0xDF => {
+                    p.state = .StringUtf8Byte2Of2;
                 },
                 0xE0...0xEF => {
-                    p.state = .StringUtf8Byte2;
+                    p.state = .StringUtf8Byte2Of3;
+                    p.sequence_first_byte = c;
                 },
-                0xF0...0xFF => {
-                    p.state = .StringUtf8Byte3;
+                // Lead bytes 0xF5 and above would reach past U+10FFFF.
+                0xF0...0xF4 => {
+                    p.state = .StringUtf8Byte2Of4;
+                    p.sequence_first_byte = c;
                 },
                 else => {
                     return error.InvalidUtf8Byte;
                 },
             },
 
-            .StringUtf8Byte3 => switch (c >> 6) {
-                0b10 => p.state = .StringUtf8Byte2,
+            .StringUtf8Byte2Of2 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    // As with ASCII above, an ordinary character clears the flag.
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },
-
-            .StringUtf8Byte2 => switch (c >> 6) {
-                0b10 => p.state = .StringUtf8Byte1,
+            .StringUtf8Byte2Of3 => {
+                // The allowed range of the second byte depends on the lead byte.
+                switch (p.sequence_first_byte) {
+                    // Below 0xA0 would be an overlong encoding.
+                    0xE0 => switch (c) {
+                        0xA0...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xE1...0xEC, 0xEE...0xEF => switch (c) {
+                        0x80...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    // 0xED 0xA0...0xBF would encode the surrogates U+D800...U+DFFF,
+                    // which UTF-8 does not allow.
+                    0xED => switch (c) {
+                        0x80...0x9F => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    else => return error.InvalidUtf8Byte,
+                }
+                p.state = .StringUtf8Byte3Of3;
+            },
+            .StringUtf8Byte3Of3 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },
-
-            .StringUtf8Byte1 => switch (c >> 6) {
-                0b10 => {
-                    p.state = .String;
-                    p.string_last_was_high_surrogate = false;
-                },
+            .StringUtf8Byte2Of4 => {
+                switch (p.sequence_first_byte) {
+                    // Below 0x90 would be an overlong encoding.
+                    0xF0 => switch (c) {
+                        0x90...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    0xF1...0xF3 => switch (c) {
+                        0x80...0xBF => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    // Above 0x8F would exceed U+10FFFF.
+                    0xF4 => switch (c) {
+                        0x80...0x8F => {},
+                        else => return error.InvalidUtf8Byte,
+                    },
+                    else => return error.InvalidUtf8Byte,
+                }
+                p.state = .StringUtf8Byte3Of4;
+            },
+            .StringUtf8Byte3Of4 => switch (c) {
+                0x80...0xBF => p.state = .StringUtf8Byte4Of4,
+                else => return error.InvalidUtf8Byte,
+            },
+            .StringUtf8Byte4Of4 => switch (c) {
+                0x80...0xBF => {
+                    p.state = .String;
+                    p.string_last_was_high_surrogate = false;
+                },
                 else => return error.InvalidUtf8Byte,
             },
 
diff --git a/lib/std/json/test.zig b/lib/std/json/test.zig
index 2dc5b860ed0c..f1876048ace6 100644
--- a/lib/std/json/test.zig
+++ b/lib/std/json/test.zig
@@ -27,6 +27,22 @@ fn err(comptime s: []const u8) void {
     } else |_| {}
 }
 
+/// Asserts that `s` is rejected by both `std.json.validate` and
+/// `std.json.Parser.parse`, and that the parser reports `error.InvalidUtf8Byte`.
+fn utf8Error(comptime s: []const u8) void {
+    std.testing.expect(!std.json.validate(s));
+
+    var mem_buffer: [1024 * 20]u8 = undefined;
+    const allocator = &std.heap.FixedBufferAllocator.init(&mem_buffer).allocator;
+    var p = std.json.Parser.init(allocator, false);
+
+    if (p.parse(s)) |_| {
+        unreachable;
+    } else |e| {
+        std.testing.expect(e == error.InvalidUtf8Byte);
+    }
+}
+
 fn any(comptime s: []const u8) void {
     _ = std.json.validate(s);
 
@@ -1936,3 +1952,62 @@ test "i_structure_UTF-8_BOM_empty_object" {
         \\{}
     );
 }
+
+test "truncated UTF-8 sequence" {
+    utf8Error("\"\xc2\"");
+    utf8Error("\"\xdf\"");
+    utf8Error("\"\xed\x9f\"");
+    utf8Error("\"\xf0\x90\"");
+    utf8Error("\"\xf0\x90\x80\"");
+}
+
+test "invalid continuation byte" {
+    utf8Error("\"\xc2\x00\"");
+    utf8Error("\"\xc2\x7f\"");
+    utf8Error("\"\xc2\xc0\"");
+    utf8Error("\"\xc3\xc1\"");
+    utf8Error("\"\xc4\xf5\"");
+    utf8Error("\"\xc5\xff\"");
+    utf8Error("\"\xe4\x80\x00\"");
+    utf8Error("\"\xe5\x80\x10\"");
+    utf8Error("\"\xe6\x80\xc0\"");
+    utf8Error("\"\xe7\x80\xf5\"");
+    utf8Error("\"\xe8\x00\x80\"");
+    utf8Error("\"\xf2\x00\x80\x80\"");
+    utf8Error("\"\xf0\x90\x00\x80\"");
+    utf8Error("\"\xf1\x80\xc0\x80\"");
+    utf8Error("\"\xf2\x80\x80\x00\"");
+    utf8Error("\"\xf3\x80\x80\xc0\"");
+    utf8Error("\"\xf4\x80\x80\xf5\"");
+}
+
+test "disallowed overlong form" {
+    utf8Error("\"\xc0\x80\"");
+    utf8Error("\"\xc0\x90\"");
+    utf8Error("\"\xc1\x80\"");
+    utf8Error("\"\xc1\x90\"");
+    utf8Error("\"\xe0\x80\x80\"");
+    utf8Error("\"\xf0\x80\x80\x80\"");
+}
+
+test "surrogate code point" {
+    utf8Error("\"\xed\xa0\x80\"");
+    utf8Error("\"\xed\xaf\xbf\"");
+    utf8Error("\"\xed\xb0\x80\"");
+    utf8Error("\"\xed\xbf\xbf\"");
+}
+
+test "out of UTF-16 range" {
+    utf8Error("\"\xf4\x90\x80\x80\"");
+    utf8Error("\"\xf5\x80\x80\x80\"");
+    utf8Error("\"\xf6\x80\x80\x80\"");
+    utf8Error("\"\xf7\x80\x80\x80\"");
+    utf8Error("\"\xf8\x80\x80\x80\"");
+    utf8Error("\"\xf9\x80\x80\x80\"");
+    utf8Error("\"\xfa\x80\x80\x80\"");
+    utf8Error("\"\xfb\x80\x80\x80\"");
+    utf8Error("\"\xfc\x80\x80\x80\"");
+    utf8Error("\"\xfd\x80\x80\x80\"");
+    utf8Error("\"\xfe\x80\x80\x80\"");
+    utf8Error("\"\xff\x80\x80\x80\"");
+}