Skip to content

std.fmt meets UTF-8 #6390

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
Nov 19, 2020
93 changes: 74 additions & 19 deletions lib/std/fmt.zig
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const std = @import("std.zig");
const math = std.math;
const assert = std.debug.assert;
const mem = std.mem;
const unicode = std.unicode;
const builtin = @import("builtin");
const errol = @import("fmt/errol.zig");
const lossyCast = std.math.lossyCast;
Expand Down Expand Up @@ -76,6 +77,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool {
/// - `b`: output integer value in binary notation
/// - `o`: output integer value in octal notation
/// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
/// - `u`: output integer as a UTF-8 sequence. Integer type must have 21 bits at max.
/// - `*`: output the address of the value instead of the value itself.
///
/// If a formatted user type contains a function of the type
Expand Down Expand Up @@ -555,6 +557,12 @@ pub fn formatIntValue(
} else {
@compileError("Cannot escape character with more than 8 bits");
}
} else if (comptime std.mem.eql(u8, fmt, "u")) {
if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) {
return formatUnicodeCodepoint(@as(u21, int_value), options, writer);
} else {
@compileError("Cannot print integer that is larger than 21 bits as an UTF-8 sequence");
}
} else if (comptime std.mem.eql(u8, fmt, "b")) {
radix = 2;
uppercase = false;
Expand Down Expand Up @@ -641,30 +649,54 @@ pub fn formatAsciiChar(
return writer.writeAll(@as(*const [1]u8, &c));
}

/// Writes the codepoint `c` to `writer` as a UTF-8 byte sequence.
/// The encoded bytes and `options` are forwarded to `formatBuf`.
/// If `c` is a surrogate half or is too large to be encoded, the UTF-8
/// encoding of the replacement character U+FFFD is written instead.
pub fn formatUnicodeCodepoint(
    c: u21,
    options: FormatOptions,
    writer: anytype,
) !void {
    // UTF-8 encoding of U+FFFD, emitted when `c` cannot be encoded
    const replacement = [_]u8{ 0xef, 0xbf, 0xbd };

    var encoded: [4]u8 = undefined;
    if (std.unicode.utf8Encode(c, &encoded)) |encoded_len| {
        return formatBuf(encoded[0..encoded_len], options, writer);
    } else |err| switch (err) {
        error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
            return formatBuf(&replacement, options, writer);
        },
    }
}

pub fn formatBuf(
buf: []const u8,
options: FormatOptions,
writer: anytype,
) !void {
const width = options.width orelse buf.len;
const padding = if (width > buf.len) (width - buf.len) else 0;

switch (options.alignment) {
.Left => {
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, padding);
},
.Center => {
const left_padding = padding / 2;
const right_padding = (padding + 1) / 2;
try writer.writeByteNTimes(options.fill, left_padding);
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, right_padding);
},
.Right => {
try writer.writeByteNTimes(options.fill, padding);
try writer.writeAll(buf);
},
if (options.width) |min_width| {
// In case of error assume the buffer content is ASCII-encoded
const width = unicode.utf8CountCodepoints(buf) catch |_| buf.len;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't decide the string width! A codepoint is not a single character. Example:
"👩‍👦‍👦" is U+1F469 U+200D U+1F466 U+200D U+1F466, which has 5 codepoints, but only width 1

You can look that up with this tool: https://cryptii.com/pipes/unicode-lookup

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This won't decide the string width!

That's a good approximation of the string width, the same approximation used by other PLs.
Entering the wcwidth territory and dealing with tables needing constant updates or mismatches between the producer (Zig, in this case) and the consumer (the terminal emulator/editor/browser) is definitely not something that I'd rank high on my todo list.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Surely it would be nice if a user could put a table in the root source file and the std.unicode APIs could use it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think the best solution is to implement the runtime width specifier (see #1358) and let the user specify the display width, I'm playing with a prototype of this idea and it looks promising.

const padding = if (width < min_width) min_width - width else 0;

if (padding == 0)
return writer.writeAll(buf);

switch (options.alignment) {
.Left => {
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, padding);
},
.Center => {
const left_padding = padding / 2;
const right_padding = (padding + 1) / 2;
try writer.writeByteNTimes(options.fill, left_padding);
try writer.writeAll(buf);
try writer.writeByteNTimes(options.fill, right_padding);
},
.Right => {
try writer.writeByteNTimes(options.fill, padding);
try writer.writeAll(buf);
},
}
} else {
// Fast path, avoid counting the number of codepoints
try writer.writeAll(buf);
}
}

Expand Down Expand Up @@ -1385,6 +1417,22 @@ test "int.specifier" {
const value: u16 = 0o1234;
try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value});
}
{
const value: u8 = 'a';
try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value});
}
{
const value: u21 = 0x1F310;
try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value});
}
{
const value: u21 = 0xD800;
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
}
{
const value: u21 = 0x110001;
try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
}
}

test "int.padded" {
Expand All @@ -1400,6 +1448,10 @@ test "int.padded" {
try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)});
try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)});
try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)});

try testFmt("UTF-8: 'ü '", "UTF-8: '{u:<4}'", .{'ü'});
try testFmt("UTF-8: ' ü'", "UTF-8: '{u:>4}'", .{'ü'});
try testFmt("UTF-8: ' ü '", "UTF-8: '{u:^4}'", .{'ü'});
}

test "buffer" {
Expand Down Expand Up @@ -1929,6 +1981,9 @@ test "padding" {
try testFmt("==================Filled", "{:=>24}", .{"Filled"});
try testFmt(" Centered ", "{:^24}", .{"Centered"});
try testFmt("-", "{:-^1}", .{""});
try testFmt("==crêpe===", "{:=^10}", .{"crêpe"});
try testFmt("=====crêpe", "{:=>10}", .{"crêpe"});
try testFmt("crêpe=====", "{:=<10}", .{"crêpe"});
}

test "decimal float padding" {
Expand Down
84 changes: 78 additions & 6 deletions lib/std/unicode.zig
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
return switch (@clz(u8, ~first_byte)) {
0 => 1,
2 => 2,
3 => 3,
4 => 4,
// The switch is optimized much better than a "smart" approach using @clz
return switch (first_byte) {
0b0000_0000 ... 0b0111_1111 => 1,
0b1100_0000 ... 0b1101_1111 => 2,
0b1110_0000 ... 0b1110_1111 => 3,
0b1111_0000 ... 0b1111_0111 => 4,
else => error.Utf8InvalidStartByte,
};
}
Expand Down Expand Up @@ -153,6 +154,50 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
return value;
}

/// Reports whether `value` is a unicode codepoint that UTF-8 can encode.
/// Surrogate halves (U+D800 through U+DFFF) and values above U+10FFFF are
/// rejected; every other value is accepted.
pub fn utf8ValidCodepoint(value: u21) bool {
    const is_surrogate = value >= 0xD800 and value <= 0xDFFF;
    // A u21 tops out at 0x1FFFFF, so a lower bound check is sufficient here
    const is_too_large = value >= 0x110000;
    return !is_surrogate and !is_too_large;
}

/// Returns the number of unicode codepoints encoded in the UTF-8 slice `s`.
/// Every multi-byte sequence is decoded in order to validate it, so an error
/// is returned when a start byte or a sequence is invalid.
/// Returns `error.TruncatedInput` when the last sequence extends past the end
/// of `s`.
pub fn utf8CountCodepoints(s: []const u8) !usize {
    const word_size = @sizeOf(usize);
    // A word with the most significant bit of each of its bytes set
    const high_bits = 0x80 * (std.math.maxInt(usize) / 0xff);

    var count: usize = 0;
    var pos: usize = 0;
    while (pos < s.len) {
        // Fast path: a whole word with no high bit set holds only ASCII
        // bytes, each of them being exactly one codepoint
        while (pos + word_size <= s.len) {
            const word = mem.readIntNative(usize, s[pos..][0..word_size]);
            if (word & high_bits != 0) break;
            count += word_size;
            pos += word_size;
        }

        if (pos >= s.len) break;

        // Slow path: consume a single codepoint starting at `pos`
        const seq_len = try utf8ByteSequenceLength(s[pos]);
        if (pos + seq_len > s.len) return error.TruncatedInput;

        // Single-byte sequences are ASCII and need no further validation
        if (seq_len != 1) {
            _ = try utf8Decode(s[pos .. pos + seq_len]);
        }

        pos += seq_len;
        count += 1;
    }

    return count;
}

pub fn utf8ValidateSlice(s: []const u8) bool {
var i: usize = 0;
while (i < s.len) {
Expand Down Expand Up @@ -687,7 +732,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
}
}

/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8.
fn calcUtf16LeLen(utf8: []const u8) usize {
var src_i: usize = 0;
var dest_len: usize = 0;
Expand Down Expand Up @@ -757,3 +801,31 @@ test "utf8ToUtf16LeStringLiteral" {
testing.expect(utf16[2] == 0);
}
}

fn testUtf8CountCodepoints() !void {
testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは"));
// testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commented out code?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My bad, I'll add it back

}

// Runs the codepoint counting checks twice: once at runtime and once at
// compile time.
test "utf8 count codepoints" {
// Runtime evaluation
try testUtf8CountCodepoints();
// Compile-time evaluation of the same checks
comptime testUtf8CountCodepoints() catch unreachable;
}

/// Checks `utf8ValidCodepoint` against a table of codepoints paired with the
/// expected answer.
fn testUtf8ValidCodepoint() !void {
    const Case = struct {
        codepoint: u21,
        valid: bool,
    };
    const cases = [_]Case{
        .{ .codepoint = 'e', .valid = true },
        .{ .codepoint = 'ë', .valid = true },
        .{ .codepoint = 'は', .valid = true },
        .{ .codepoint = 0xe000, .valid = true },
        .{ .codepoint = 0x10ffff, .valid = true },
        .{ .codepoint = 0xd800, .valid = false },
        .{ .codepoint = 0xdfff, .valid = false },
        .{ .codepoint = 0x110000, .valid = false },
    };

    for (cases) |case| {
        testing.expect(utf8ValidCodepoint(case.codepoint) == case.valid);
    }
}

// Runs the codepoint validity checks twice: once at runtime and once at
// compile time.
test "utf8 valid codepoint" {
// Runtime evaluation
try testUtf8ValidCodepoint();
// Compile-time evaluation of the same checks
comptime testUtf8ValidCodepoint() catch unreachable;
}
104 changes: 68 additions & 36 deletions lib/std/unicode/throughput_test.zig
Original file line number Diff line number Diff line change
Expand Up @@ -3,47 +3,79 @@
// This file is part of [zig](https://ziglang.org/), which is MIT licensed.
// The MIT license requires this copyright notice to be included in all copies
// and substantial portions of the software.
const builtin = @import("builtin");
const std = @import("std");
const builtin = std.builtin;
const time = std.time;
const unicode = std.unicode;

const Timer = time.Timer;

const N = 1_000_000;

const KiB = 1024;
const MiB = 1024 * KiB;
const GiB = 1024 * MiB;

/// Outcome of a single `benchmarkCodepointCount` run.
const ResultCount = struct {
/// Value returned by the last call of the benchmarked function.
count: usize,
/// Number of input bytes processed per second.
throughput: u64,
};

/// Measures how fast `std.unicode.utf8CountCodepoints` processes `buf`.
/// The function is called `N` times through a never-inlined call and the
/// total time spent in the loop is measured.
/// Returns the count produced by the last call together with the throughput
/// in bytes per second. Errors from starting the timer or from the counting
/// function are propagated.
fn benchmarkCodepointCount(buf: []const u8) !ResultCount {
    var timer = try Timer.start();

    const bytes = N * buf.len;

    // Restart the timer right before the measured loop. `read` reports the
    // time elapsed since the last reset, so its value is the loop duration
    // and nothing has to be subtracted from it.
    timer.reset();
    var i: usize = 0;
    var r: usize = undefined;
    while (i < N) : (i += 1) {
        // Prevent inlining so that every iteration pays for a real call
        r = try @call(
            .{ .modifier = .never_inline },
            std.unicode.utf8CountCodepoints,
            .{buf},
        );
    }
    const elapsed_ns = timer.read();

    const elapsed_s = @intToFloat(f64, elapsed_ns) / time.ns_per_s;
    const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s);

    return ResultCount{ .count = r, .throughput = throughput };
}

pub fn main() !void {
const stdout = std.io.getStdOut().outStream();

const args = try std.process.argsAlloc(std.heap.page_allocator);

// Warm up runs
var buffer0: [32767]u16 align(4096) = undefined;
_ = try std.unicode.utf8ToUtf16Le(&buffer0, args[1]);
_ = try std.unicode.utf8ToUtf16Le_better(&buffer0, args[1]);

@fence(.SeqCst);
var timer = try std.time.Timer.start();
@fence(.SeqCst);

var buffer1: [32767]u16 align(4096) = undefined;
_ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]);

@fence(.SeqCst);
const elapsed_ns_orig = timer.lap();
@fence(.SeqCst);

var buffer2: [32767]u16 align(4096) = undefined;
_ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]);

@fence(.SeqCst);
const elapsed_ns_better = timer.lap();
@fence(.SeqCst);

std.debug.warn("original utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{
elapsed_ns_orig, elapsed_ns_orig / 1000000,
});
std.debug.warn("new utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{
elapsed_ns_better, elapsed_ns_better / 1000000,
});
asm volatile ("nop"
:
: [a] "r" (&buffer1),
[b] "r" (&buffer2)
: "memory"
);
try stdout.print("short ASCII strings\n", .{});
{
const result = try benchmarkCodepointCount("abc");
try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
}

try stdout.print("short Unicode strings\n", .{});
{
const result = try benchmarkCodepointCount("ŌŌŌ");
try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
}

try stdout.print("pure ASCII strings\n", .{});
{
const result = try benchmarkCodepointCount("hello" ** 16);
try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
}

try stdout.print("pure Unicode strings\n", .{});
{
const result = try benchmarkCodepointCount("こんにちは" ** 16);
try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
}

try stdout.print("mixed ASCII/Unicode strings\n", .{});
{
const result = try benchmarkCodepointCount("Hyvää huomenta" ** 16);
try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
}
}