@@ -800,6 +800,129 @@ fn testDecode(bytes: []const u8) !u21 {
800
800
return utf8Decode (bytes );
801
801
}
802
802
803
/// Print the given `utf8` string, encoded as UTF-8 bytes.
/// Ill-formed UTF-8 byte sequences are replaced by the replacement character (U+FFFD)
/// according to "U+FFFD Substitution of Maximal Subparts" from Chapter 3 of
/// the Unicode standard, and as specified by https://encoding.spec.whatwg.org/#utf-8-decoder
///
/// `fmt` and `options` are ignored. Output is staged in a fixed-size stack buffer
/// and handed to `writer.writeAll`; any error returned by the writer is propagated.
fn formatUtf8(
    utf8: []const u8,
    comptime fmt: []const u8,
    options: std.fmt.FormatOptions,
    writer: anytype,
) !void {
    _ = fmt;
    _ = options;
    var buf: [300]u8 = undefined; // just an arbitrary size
    var u8len: usize = 0;

    // This implementation is based on this specification:
    // https://encoding.spec.whatwg.org/#utf-8-decoder
    //
    // Well-formed sequences are copied verbatim from the input, so the decoded
    // code point value is never needed; only the validation state is tracked.
    var cont_bytes_seen: u3 = 0;
    var cont_bytes_needed: u3 = 0;
    // Inclusive range that the next continuation byte must fall within.
    // Some lead bytes narrow it for the first continuation byte only.
    var lower_boundary: u8 = 0x80;
    var upper_boundary: u8 = 0xBF;

    var i: usize = 0;
    while (i < utf8.len) {
        const byte = utf8[i];
        if (cont_bytes_needed == 0) {
            switch (byte) {
                // ASCII is passed through unchanged.
                0x00...0x7F => {
                    buf[u8len] = byte;
                    u8len += 1;
                },
                0xC2...0xDF => {
                    cont_bytes_needed = 1;
                },
                0xE0...0xEF => {
                    if (byte == 0xE0) lower_boundary = 0xA0;
                    if (byte == 0xED) upper_boundary = 0x9F;
                    cont_bytes_needed = 2;
                },
                0xF0...0xF4 => {
                    if (byte == 0xF0) lower_boundary = 0x90;
                    if (byte == 0xF4) upper_boundary = 0x8F;
                    cont_bytes_needed = 3;
                },
                // Anything else cannot start a sequence.
                else => {
                    u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
                },
            }
            // consume the byte
            i += 1;
        } else if (byte < lower_boundary or byte > upper_boundary) {
            // The sequence in progress is ill-formed: emit a single replacement
            // character for the part seen so far and return to the initial state.
            cont_bytes_needed = 0;
            cont_bytes_seen = 0;
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
            // do not consume the current byte, it should now be treated as a possible start byte
        } else {
            // Valid continuation byte; any narrowed range applied to this byte only.
            lower_boundary = 0x80;
            upper_boundary = 0xBF;
            cont_bytes_seen += 1;
            // consume the byte
            i += 1;

            if (cont_bytes_seen == cont_bytes_needed) {
                // The sequence is complete and well-formed: copy its bytes
                // (lead byte plus continuation bytes) straight from the input.
                const sequence_len = cont_bytes_seen + 1;
                const sequence_start = i - sequence_len;
                @memcpy(buf[u8len..][0..sequence_len], utf8[sequence_start..][0..sequence_len]);
                u8len += sequence_len;

                cont_bytes_needed = 0;
                cont_bytes_seen = 0;
            }
        }
        // make sure there's always enough room for another maximum length UTF-8 codepoint
        if (u8len + 4 > buf.len) {
            try writer.writeAll(buf[0..u8len]);
            u8len = 0;
        }
    }
    if (cont_bytes_needed != 0) {
        // The input ended in the middle of a sequence.
        // we know there's enough room because we always flush
        // if there's less than 4 bytes remaining in the buffer.
        u8len += utf8Encode(replacement_character, buf[u8len..]) catch unreachable;
    }
    try writer.writeAll(buf[0..u8len]);
}
898
+
899
/// Wrap a (potentially ill-formed) UTF-8 string in a Formatter.
/// When formatted, ill-formed UTF-8 byte sequences are replaced by the
/// replacement character (U+FFFD) according to "U+FFFD Substitution of Maximal
/// Subparts" from Chapter 3 of the Unicode standard, and as specified by
/// https://encoding.spec.whatwg.org/#utf-8-decoder
/// The returned Formatter stores the `utf8` slice itself.
pub fn fmtUtf8(utf8: []const u8) std.fmt.Formatter(formatUtf8) {
    return std.fmt.Formatter(formatUtf8){ .data = utf8 };
}
906
+
907
test "fmtUtf8" {
    const expectFmt = testing.expectFmt;
    // U+FFFD REPLACEMENT CHARACTER, spelled as an escape so the expected
    // strings do not depend on how the source file's bytes are rendered.
    const rc = "\u{FFFD}";

    // Well-formed input is passed through unchanged.
    try expectFmt("", "{}", .{fmtUtf8("")});
    try expectFmt("foo", "{}", .{fmtUtf8("foo")});
    try expectFmt("𐐷", "{}", .{fmtUtf8("𐐷")});

    // Table 3-8. U+FFFD for Non-Shortest Form Sequences
    try expectFmt(rc ** 8 ++ "A", "{}", .{fmtUtf8("\xC0\xAF\xE0\x80\xBF\xF0\x81\x82A")});

    // Table 3-9. U+FFFD for Ill-Formed Sequences for Surrogates
    try expectFmt(rc ** 8 ++ "A", "{}", .{fmtUtf8("\xED\xA0\x80\xED\xBF\xBF\xED\xAFA")});

    // Table 3-10. U+FFFD for Other Ill-Formed Sequences
    try expectFmt(rc ** 5 ++ "A" ++ rc ** 2 ++ "B", "{}", .{fmtUtf8("\xF4\x91\x92\x93\xFFA\x80\xBFB")});

    // Table 3-11. U+FFFD for Truncated Sequences
    try expectFmt(rc ** 4 ++ "A", "{}", .{fmtUtf8("\xE1\x80\xE2\xF0\x91\x92\xF1\xBFA")});

    // A sequence cut off by the end of the input yields a single replacement character.
    try expectFmt("A" ++ rc, "{}", .{fmtUtf8("A\xE1\x80")});

    // Input longer than the formatter's internal buffer, with a 4-byte sequence
    // right after the point where the buffer is flushed.
    const long_ascii = "a" ** 297;
    try expectFmt(long_ascii ++ "𐐷", "{}", .{fmtUtf8(long_ascii ++ "𐐷")});
}
925
+
803
926
fn utf16LeToUtf8ArrayListImpl (array_list : * std .ArrayList (u8 ), utf16le : []const u16 , comptime surrogates : Surrogates ) ! void {
804
927
// optimistically guess that it will all be ascii.
805
928
try array_list .ensureTotalCapacityPrecise (utf16le .len );
@@ -1264,13 +1387,14 @@ fn formatUtf16Le(
1264
1387
) ! void {
1265
1388
_ = fmt ;
1266
1389
_ = options ;
1267
- var buf : [300 ]u8 = undefined ; // just a random size I chose
1390
+ var buf : [300 ]u8 = undefined ; // just an arbitrary size
1268
1391
var it = Utf16LeIterator .init (utf16le );
1269
1392
var u8len : usize = 0 ;
1270
1393
while (it .nextCodepoint () catch replacement_character ) | codepoint | {
1271
1394
u8len += utf8Encode (codepoint , buf [u8len .. ]) catch
1272
1395
utf8Encode (replacement_character , buf [u8len .. ]) catch unreachable ;
1273
- if (u8len + 3 >= buf .len ) {
1396
+ // make sure there's always enough room for another maximum length UTF-8 codepoint
1397
+ if (u8len + 4 > buf .len ) {
1274
1398
try writer .writeAll (buf [0.. u8len ]);
1275
1399
u8len = 0 ;
1276
1400
}
@@ -1281,7 +1405,9 @@ fn formatUtf16Le(
1281
1405
/// Deprecated; renamed to fmtUtf16Le
1282
1406
pub const fmtUtf16le = fmtUtf16Le ;
1283
1407
1284
- /// Return a Formatter for a Utf16le string
1408
/// Wrap a (potentially ill-formed) UTF-16 LE string in a Formatter.
/// The string is converted to UTF-8 during formatting, and unpaired
/// surrogates are replaced by the replacement character (U+FFFD).
/// The returned Formatter stores the `utf16le` slice itself.
pub fn fmtUtf16Le(utf16le: []const u16) std.fmt.Formatter(formatUtf16Le) {
    return std.fmt.Formatter(formatUtf16Le){ .data = utf16le };
}
0 commit comments