Skip to content

Commit 473cb1f

Browse files
authored
Merge pull request #6390 from LemonBoy/reboot-3970
std.fmt meets UTF-8
2 parents d526b0f + 60638f0 commit 473cb1f

File tree

3 files changed

+220
-61
lines changed

3 files changed

+220
-61
lines changed

lib/std/fmt.zig

Lines changed: 74 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ const std = @import("std.zig");
77
const math = std.math;
88
const assert = std.debug.assert;
99
const mem = std.mem;
10+
const unicode = std.unicode;
1011
const builtin = @import("builtin");
1112
const errol = @import("fmt/errol.zig");
1213
const lossyCast = std.math.lossyCast;
@@ -76,6 +77,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool {
7677
/// - `b`: output integer value in binary notation
7778
/// - `o`: output integer value in octal notation
7879
/// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
80+
/// - `u`: output integer as a UTF-8 sequence. Integer type must have at most 21 bits.
7981
/// - `*`: output the address of the value instead of the value itself.
8082
///
8183
/// If a formatted user type contains a function of the type
@@ -555,6 +557,12 @@ pub fn formatIntValue(
555557
} else {
556558
@compileError("Cannot escape character with more than 8 bits");
557559
}
560+
} else if (comptime std.mem.eql(u8, fmt, "u")) {
561+
if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) {
562+
return formatUnicodeCodepoint(@as(u21, int_value), options, writer);
563+
} else {
564+
@compileError("Cannot print integer that is larger than 21 bits as an UTF-8 sequence");
565+
}
558566
} else if (comptime std.mem.eql(u8, fmt, "b")) {
559567
radix = 2;
560568
uppercase = false;
@@ -641,30 +649,54 @@ pub fn formatAsciiChar(
641649
return writer.writeAll(@as(*const [1]u8, &c));
642650
}
643651

652+
/// Writes the codepoint `c` to `writer` as a UTF-8 byte sequence, applying
/// the width, fill and alignment from `options` through `formatBuf`.
/// A codepoint that cannot be encoded (a surrogate half, or a value that is
/// too large) is replaced by U+FFFD, the replacement character.
pub fn formatUnicodeCodepoint(
    c: u21,
    options: FormatOptions,
    writer: anytype,
) !void {
    // UTF-8 encoding of U+FFFD, emitted when `c` is not encodable.
    const replacement = [_]u8{ 0xef, 0xbf, 0xbd };

    var encoded: [4]u8 = undefined;
    if (std.unicode.utf8Encode(c, &encoded)) |encoded_len| {
        return formatBuf(encoded[0..encoded_len], options, writer);
    } else |err| switch (err) {
        error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
            return formatBuf(&replacement, options, writer);
        },
    }
}
666+
644667
/// Writes `buf` to `writer`, padding it with `options.fill` up to
/// `options.width` according to `options.alignment`.
/// The width of `buf` is measured in Unicode codepoints; if `buf` is not
/// valid UTF-8 its byte length is used instead.
pub fn formatBuf(
    buf: []const u8,
    options: FormatOptions,
    writer: anytype,
) !void {
    // Fast path: without a requested width there is no need to count codepoints.
    const min_width = options.width orelse return writer.writeAll(buf);

    // In case of error assume the buffer content is ASCII-encoded.
    const width = unicode.utf8CountCodepoints(buf) catch buf.len;
    if (width >= min_width) return writer.writeAll(buf);

    const padding = min_width - width;
    switch (options.alignment) {
        .Left => {
            try writer.writeAll(buf);
            try writer.writeByteNTimes(options.fill, padding);
        },
        .Center => {
            // When the padding is odd, the extra fill byte goes on the right.
            const before = padding / 2;
            const after = padding - before;
            try writer.writeByteNTimes(options.fill, before);
            try writer.writeAll(buf);
            try writer.writeByteNTimes(options.fill, after);
        },
        .Right => {
            try writer.writeByteNTimes(options.fill, padding);
            try writer.writeAll(buf);
        },
    }
}
670702

@@ -1385,6 +1417,22 @@ test "int.specifier" {
13851417
const value: u16 = 0o1234;
13861418
try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value});
13871419
}
1420+
{
1421+
const value: u8 = 'a';
1422+
try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value});
1423+
}
1424+
{
1425+
const value: u21 = 0x1F310;
1426+
try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value});
1427+
}
1428+
{
1429+
const value: u21 = 0xD800;
1430+
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
1431+
}
1432+
{
1433+
const value: u21 = 0x110001;
1434+
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
1435+
}
13881436
}
13891437

13901438
test "int.padded" {
@@ -1400,6 +1448,10 @@ test "int.padded" {
14001448
try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)});
14011449
try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)});
14021450
try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)});
1451+
1452+
try testFmt("UTF-8: 'ü '", "UTF-8: '{u:<4}'", .{'ü'});
1453+
try testFmt("UTF-8: ' ü'", "UTF-8: '{u:>4}'", .{'ü'});
1454+
try testFmt("UTF-8: ' ü '", "UTF-8: '{u:^4}'", .{'ü'});
14031455
}
14041456

14051457
test "buffer" {
@@ -1929,6 +1981,9 @@ test "padding" {
19291981
try testFmt("==================Filled", "{:=>24}", .{"Filled"});
19301982
try testFmt(" Centered ", "{:^24}", .{"Centered"});
19311983
try testFmt("-", "{:-^1}", .{""});
1984+
try testFmt("==crêpe===", "{:=^10}", .{"crêpe"});
1985+
try testFmt("=====crêpe", "{:=>10}", .{"crêpe"});
1986+
try testFmt("crêpe=====", "{:=<10}", .{"crêpe"});
19321987
}
19331988

19341989
test "decimal float padding" {

lib/std/unicode.zig

Lines changed: 78 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
2323
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
2424
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
2525
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
    // A plain range switch on the start byte; this is optimized much better
    // than a "smart" approach using @clz.
    return switch (first_byte) {
        0x00...0x7F => 1, // 0xxxxxxx
        0xC0...0xDF => 2, // 110xxxxx
        0xE0...0xEF => 3, // 1110xxxx
        0xF0...0xF7 => 4, // 11110xxx
        // Everything else (0x80...0xBF and 0xF8...0xFF) is not a start byte.
        else => error.Utf8InvalidStartByte,
    };
}
@@ -153,6 +154,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
153154
return value;
154155
}
155156

157+
/// Returns true if the given unicode codepoint can be encoded in UTF-8,
/// i.e. it is neither in the surrogates range (U+D800 to U+DFFF) nor above
/// the maximum codepoint value (U+10FFFF).
pub fn utf8ValidCodepoint(value: u21) bool {
    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
    const is_too_large = value > 0x10FFFF;
    return !is_surrogate and !is_too_large;
}
165+
166+
/// Returns the length of the supplied UTF-8 string in terms of unicode
/// codepoints.
/// Returns an error if the data is not valid UTF-8: `error.TruncatedInput`
/// when the last sequence is cut short, or whatever error
/// `utf8ByteSequenceLength` / `utf8Decode` report for a malformed sequence.
pub fn utf8CountCodepoints(s: []const u8) !usize {
    const word_size = @sizeOf(usize);
    // 0x80 repeated in every byte of a usize: a non-zero AND with a word
    // means at least one byte in that word is not ASCII.
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var count: usize = 0;
    var pos: usize = 0;
    while (pos < s.len) {
        // Fast path for ASCII sequences: consume a whole word at a time.
        while (pos + word_size <= s.len) {
            const word = mem.readIntNative(usize, s[pos..][0..word_size]);
            if (word & high_bits != 0) break;
            count += word_size;
            pos += word_size;
        }

        if (pos >= s.len) break;

        // Slow path: handle exactly one codepoint, then retry the fast path.
        const seq_len = try utf8ByteSequenceLength(s[pos]);
        if (pos + seq_len > s.len) return error.TruncatedInput;

        // Single-byte sequences are ASCII and need no validation.
        if (seq_len != 1) {
            _ = try utf8Decode(s[pos .. pos + seq_len]);
        }

        pos += seq_len;
        count += 1;
    }

    return count;
}
200+
156201
pub fn utf8ValidateSlice(s: []const u8) bool {
157202
var i: usize = 0;
158203
while (i < s.len) {
@@ -687,7 +732,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
687732
}
688733
}
689734

690-
/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8.
691735
fn calcUtf16LeLen(utf8: []const u8) usize {
692736
var src_i: usize = 0;
693737
var dest_len: usize = 0;
@@ -757,3 +801,31 @@ test "utf8ToUtf16LeStringLiteral" {
757801
testing.expect(utf16[2] == 0);
758802
}
759803
}
804+
805+
/// Checks `utf8CountCodepoints` against strings made of one-, two- and
/// three-byte UTF-8 sequences.
fn testUtf8CountCodepoints() !void {
    const ascii = "abcdefghij"; // one byte per codepoint
    const latin = "äåéëþüúíóö"; // two bytes per codepoint
    const kana = "こんにちは"; // three bytes per codepoint

    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints(ascii));
    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints(latin));
    testing.expectEqual(@as(usize, 5), try utf8CountCodepoints(kana));
    // Disabled in the original commit:
    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
}

test "utf8 count codepoints" {
    // Exercise the function both at runtime and at compile time.
    try testUtf8CountCodepoints();
    comptime {
        testUtf8CountCodepoints() catch unreachable;
    }
}
816+
817+
/// Checks `utf8ValidCodepoint` on encodable codepoints and on the edges of
/// the two rejected ranges.
fn testUtf8ValidCodepoint() !void {
    const accepted = [_]u21{ 'e', 'ë', 'は', 0xe000, 0x10ffff };
    const rejected = [_]u21{ 0xd800, 0xdfff, 0x110000 };

    for (accepted) |codepoint| {
        testing.expect(utf8ValidCodepoint(codepoint));
    }
    for (rejected) |codepoint| {
        testing.expect(!utf8ValidCodepoint(codepoint));
    }
}

test "utf8 valid codepoint" {
    // Exercise the function both at runtime and at compile time.
    try testUtf8ValidCodepoint();
    comptime {
        testUtf8ValidCodepoint() catch unreachable;
    }
}

lib/std/unicode/throughput_test.zig

Lines changed: 68 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,47 +3,79 @@
33
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
44
// The MIT license requires this copyright notice to be included in all copies
55
// and substantial portions of the software.
6-
const builtin = @import("builtin");
76
const std = @import("std");
7+
const builtin = std.builtin;
8+
const time = std.time;
9+
const unicode = std.unicode;
10+
11+
const Timer = time.Timer;
12+
13+
const N = 1_000_000;
14+
15+
const KiB = 1024;
16+
const MiB = 1024 * KiB;
17+
const GiB = 1024 * MiB;
18+
19+
/// Outcome of one `benchmarkCodepointCount` run.
const ResultCount = struct {
    /// Value returned by the last `utf8CountCodepoints` call.
    count: usize,
    /// Processed input bytes per second.
    throughput: u64,
};

/// Calls `std.unicode.utf8CountCodepoints` on `buf` `N` times and reports the
/// codepoint count together with the measured throughput in bytes per second.
/// Returns an error if the timer cannot be started or `buf` is rejected by
/// `utf8CountCodepoints`.
fn benchmarkCodepointCount(buf: []const u8) !ResultCount {
    const bytes = N * buf.len;

    // Start the timer immediately before the measured loop so that `read()`
    // yields exactly the loop duration. The previous `lap()`/`read()` pair
    // subtracted the (unrelated) time between `Timer.start()` and `lap()`
    // from the measurement.
    var timer = try Timer.start();
    var i: usize = 0;
    var r: usize = undefined;
    while (i < N) : (i += 1) {
        // never_inline keeps the call from being folded into the loop.
        r = try @call(
            .{ .modifier = .never_inline },
            std.unicode.utf8CountCodepoints,
            .{buf},
        );
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @intToFloat(f64, elapsed_ns) / time.ns_per_s;
    const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s);

    return ResultCount{ .count = r, .throughput = throughput };
}
846

947
/// Benchmarks codepoint counting over a fixed set of inputs and prints, for
/// each one, the throughput in MiB/s and the resulting codepoint count to
/// stdout.
pub fn main() !void {
    const stdout = std.io.getStdOut().outStream();

    // One benchmark input together with the heading printed above its result.
    const Case = struct {
        title: []const u8,
        input: []const u8,
    };
    const cases = [_]Case{
        .{ .title = "short ASCII strings", .input = "abc" },
        .{ .title = "short Unicode strings", .input = "ŌŌŌ" },
        .{ .title = "pure ASCII strings", .input = "hello" ** 16 },
        .{ .title = "pure Unicode strings", .input = "こんにちは" ** 16 },
        .{ .title = "mixed ASCII/Unicode strings", .input = "Hyvää huomenta" ** 16 },
    };

    // The command-line arguments are no longer read: the previous
    // `argsAlloc` result was never used and never freed.
    for (cases) |case| {
        try stdout.print("{s}\n", .{case.title});
        const result = try benchmarkCodepointCount(case.input);
        try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
    }
}

0 commit comments

Comments
 (0)