Skip to content

Commit f6b6b8a

Browse files
committed
Add std.unicode.fmtUtf8 that can handle ill-formed UTF-8
Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD) according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
1 parent 80508b9 commit f6b6b8a

File tree

1 file changed

+129
-3
lines changed

1 file changed

+129
-3
lines changed

lib/std/unicode.zig

Lines changed: 129 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -800,6 +800,129 @@ fn testDecode(bytes: []const u8) !u21 {
800800
return utf8Decode(bytes);
801801
}
802802

803+
/// Print the given `utf8` string, encoded as UTF-8 bytes.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// `fmt` and `options` are ignored. Output is staged in a fixed-size stack buffer
/// and handed to `writer.writeAll` in chunks; any error from the writer is returned.
fn formatUtf8(
    utf8: []const u8,
    comptime fmt: []const u8,
    options: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    _ = fmt;
    _ = options;
    var buf: [300]u8 = undefined; // just an arbitrary size
    var u8len: usize = 0;

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    //
    // Well-formed sequences are copied verbatim from the input, so the decoded
    // code point value is never needed and is intentionally not tracked.
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range the *next* continuation byte must fall in. Only the first
    // continuation byte after certain lead bytes uses a narrowed range.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            switch (byte) {
                0x00...0x7F => {
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                },
                0xE0...0xEF => {
                    // 0xE0 0x80..0x9F would be a non-shortest form.
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    // 0xED 0xA0..0xBF would encode a surrogate.
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                },
                0xF0...0xF4 => {
                    // 0xF0 0x80..0x8F would be a non-shortest form.
                    if (byte == 0xF0) lower_boundary = 0x90;
                    // 0xF4 0x90..0xBF would exceed U+10FFFF.
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                },
                else => {
                    // 0x80...0xC1 and 0xF5...0xFF can never start a well-formed sequence.
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else {
            const in_range = byte >= lower_boundary and byte <= upper_boundary;
            // The narrowed range applies only to the byte just examined; whatever
            // follows (continuation or fresh lead byte) uses the default range.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;

            if (!in_range) {
                // The maximal subpart seen so far collapses into a single U+FFFD.
                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
                u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                // do not consume the current byte, it should now be treated as a possible start byte
            } else {
                cont_bytes_seen += 1;
                // consume the byte
                i += 1;

                if (cont_bytes_seen == cont_bytes_needed) {
                    // Sequence complete: copy the lead byte plus its continuation bytes as-is.
                    const codepoint_len = cont_bytes_seen + 1;
                    const codepoint_start_i = i - codepoint_len;
                    @memcpy(buf[u8len..][0..codepoint_len], utf8[codepoint_start_i..][0..codepoint_len]);
                    u8len += codepoint_len;

                    cont_bytes_needed = 0;
                    cont_bytes_seen = 0;
                }
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // Input ended in the middle of a sequence.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}
898+
899+
/// Wrap a (potentially ill-formed) UTF-8 string in a `std.fmt.Formatter`.
/// When formatted, ill-formed UTF-8 byte sequences are replaced by the
/// replacement character (U+FFFD) according to "U+FFFD Substitution of Maximal
/// Subparts" from Chapter 3 of the Unicode standard, and as specified by
/// https://encoding.spec.whatwg.org/#utf-8-decoder
pub fn fmtUtf8(utf8: []const u8) std.fmt.Formatter(formatUtf8) {
    const Utf8Formatter = std.fmt.Formatter(formatUtf8);
    return Utf8Formatter{ .data = utf8 };
}
906+
907+
test "fmtUtf8" {
    const Case = struct {
        expected: []const u8,
        input: []const u8,
    };
    const cases = [_]Case{
        // Well-formed input passes through untouched.
        .{ .expected = "", .input = "" },
        .{ .expected = "foo", .input = "foo" },
        .{ .expected = "𐐷", .input = "𐐷" },

        // Table 3-8. U+FFFD for Non-Shortest Form Sequences
        .{ .expected = "��������A", .input = "\xC0\xAF\xE0\x80\xBF\xF0\x81\x82A" },

        // Table 3-9. U+FFFD for Ill-Formed Sequences for Surrogates
        .{ .expected = "��������A", .input = "\xED\xA0\x80\xED\xBF\xBF\xED\xAFA" },

        // Table 3-10. U+FFFD for Other Ill-Formed Sequences
        .{ .expected = "�����A��B", .input = "\xF4\x91\x92\x93\xFFA\x80\xBFB" },

        // Table 3-11. U+FFFD for Truncated Sequences
        .{ .expected = "����A", .input = "\xE1\x80\xE2\xF0\x91\x92\xF1\xBFA" },
    };

    for (cases) |case| {
        try testing.expectFmt(case.expected, "{}", .{fmtUtf8(case.input)});
    }
}
925+
803926
fn utf16LeToUtf8ArrayListImpl(array_list: *std.ArrayList(u8), utf16le: []const u16, comptime surrogates: Surrogates) !void {
804927
// optimistically guess that it will all be ascii.
805928
try array_list.ensureTotalCapacityPrecise(utf16le.len);
@@ -1264,13 +1387,14 @@ fn formatUtf16Le(
12641387
) !void {
12651388
_ = fmt;
12661389
_ = options;
1267-
var buf: [300]u8 = undefined; // just a random size I chose
1390+
var buf: [300]u8 = undefined; // just an arbitrary size
12681391
var it = Utf16LeIterator.init(utf16le);
12691392
var u8len: usize = 0;
12701393
while (it.nextCodepoint() catch replacement_character) |codepoint| {
12711394
u8len += utf8Encode(codepoint, buf[u8len..]) catch
12721395
utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
1273-
if (u8len + 3 >= buf.len) {
1396+
// make sure there's always enough room for another maximum length UTF-8 codepoint
1397+
if (u8len + 4 > buf.len) {
12741398
try writer.writeAll(buf[0..u8len]);
12751399
u8len = 0;
12761400
}
@@ -1281,7 +1405,9 @@ fn formatUtf16Le(
12811405
/// Deprecated; renamed to fmtUtf16Le
12821406
pub const fmtUtf16le = fmtUtf16Le;
12831407

1284-
/// Return a Formatter for a Utf16le string
1408+
/// Wrap a (potentially ill-formed) UTF-16 LE string in a `std.fmt.Formatter`.
/// The string is converted to UTF-8 while it is being formatted, and
/// unpaired surrogates are replaced by the replacement character (U+FFFD).
pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {
    const Utf16LeFormatter = std.fmt.Formatter(formatUtf16Le);
    return Utf16LeFormatter{ .data = utf16le };
}

0 commit comments

Comments
 (0)