From 678ecc94ca8584e8fef9bfae4ed5fa97c62c58e1 Mon Sep 17 00:00:00 2001 From: data-man Date: Sun, 22 Dec 2019 15:38:27 +0500 Subject: [PATCH 01/10] Add 'u' specifier to std.format --- lib/std/fmt.zig | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index 81d7ce588d0f..b5b4998da600 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -76,6 +76,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool { /// - `b`: output integer value in binary notation /// - `o`: output integer value in octal notation /// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max. +/// - `u`: output integer as an UTF-8 sequence. Integer type must have 32 bits at max. /// - `*`: output the address of the value instead of the value itself. /// /// If a formatted user type contains a function of the type @@ -555,6 +556,12 @@ pub fn formatIntValue( } else { @compileError("Cannot escape character with more than 8 bits"); } + } else if (comptime std.mem.eql(u8, fmt, "u")) { + if (@TypeOf(int_value).bit_count <= 32) { + return formatUtf8Codepoint(@as(u32, int_value), options, context, Errors, output); + } else { + @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence"); + } } else if (comptime std.mem.eql(u8, fmt, "b")) { radix = 2; uppercase = false; @@ -641,6 +648,18 @@ pub fn formatAsciiChar( return writer.writeAll(@as(*const [1]u8, &c)); } +pub fn formatUtf8Codepoint( + c: u32, + options: FormatOptions, + context: anytype, + comptime Errors: type, + output: fn (@TypeOf(context), []const u8) Errors!void, +) Errors!void { + var buf: [4]u8 = undefined; + const len = std.unicode.utf8Encode(c, buf[0..]) catch unreachable; + return output(context, @as(*const [4]u8, &buf)[0..len]); +} + pub fn formatBuf( buf: []const u8, options: FormatOptions, @@ -1385,6 +1404,14 @@ test "int.specifier" { const value: u16 = 0o1234; try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value}); } + { + 
const value: u8 = 'a'; + try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value}); + } + { + const value: u32 = 0x1F310; + try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value}); + } } test "int.padded" { From 2cce23062b95cf112ddbf4613c5a7e9ff60f0f88 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 12:39:35 +0200 Subject: [PATCH 02/10] Update the API and add error-recovery path --- lib/std/fmt.zig | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index b5b4998da600..8a3bf2aa2de4 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -557,8 +557,8 @@ pub fn formatIntValue( @compileError("Cannot escape character with more than 8 bits"); } } else if (comptime std.mem.eql(u8, fmt, "u")) { - if (@TypeOf(int_value).bit_count <= 32) { - return formatUtf8Codepoint(@as(u32, int_value), options, context, Errors, output); + if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) { + return formatUnicodeCodepoint(@as(u21, int_value), options, writer); } else { @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence"); } @@ -648,16 +648,22 @@ pub fn formatAsciiChar( return writer.writeAll(@as(*const [1]u8, &c)); } -pub fn formatUtf8Codepoint( - c: u32, +pub fn formatUnicodeCodepoint( + c: u21, options: FormatOptions, - context: anytype, - comptime Errors: type, - output: fn (@TypeOf(context), []const u8) Errors!void, -) Errors!void { + writer: anytype, +) !void { var buf: [4]u8 = undefined; - const len = std.unicode.utf8Encode(c, buf[0..]) catch unreachable; - return output(context, @as(*const [4]u8, &buf)[0..len]); + // In case of error output the replacement char U+FFFD + const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) { + error.Utf8CannotEncodeSurrogateHalf => { + return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd }); + }, + error.CodepointTooLarge => { + return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd }); + }, + }; + 
return writer.writeAll(buf[0..len]); } pub fn formatBuf( @@ -1409,9 +1415,17 @@ test "int.specifier" { try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value}); } { - const value: u32 = 0x1F310; + const value: u21 = 0x1F310; try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value}); } + { + const value: u21 = 0xD800; + try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value}); + } + { + const value: u21 = 0x110001; + try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value}); + } } test "int.padded" { From 6c4efab10611b9fe807f2517d2eec2ac60ae4f5c Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 15:19:14 +0200 Subject: [PATCH 03/10] std: Introduce std.unicode.utf8CountCodepoints --- lib/std/unicode.zig | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 18bd5ab0e2af..06dd78bd4092 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -153,6 +153,23 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { return value; } +/// Returns the length of a supplied UTF-8 string literal in terms of unicode +/// codepoints. +/// Asserts that the data is valid UTF-8. +pub fn utf8CountCodepoints(s: []const u8) !usize { + var len: usize = 0; + + var i: usize = 0; + while (i < s.len) : (len += 1) { + const n = try utf8ByteSequenceLength(s[i]); + if (i + n > s.len) return error.TruncatedInput; + _ = try utf8Decode(s[i .. i + n]); + i += n; + } + + return len; +} + pub fn utf8ValidateSlice(s: []const u8) bool { var i: usize = 0; while (i < s.len) { @@ -687,7 +704,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le } } -/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8. 
fn calcUtf16LeLen(utf8: []const u8) usize { var src_i: usize = 0; var dest_len: usize = 0; @@ -757,3 +773,15 @@ test "utf8ToUtf16LeStringLiteral" { testing.expect(utf16[2] == 0); } } + +fn testUtf8CountCodepoints() !void { + testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij")); + testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö")); + testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは")); + testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); +} + +test "utf8 count codepoints" { + try testUtf8CountCodepoints(); + comptime testUtf8CountCodepoints() catch unreachable; +} From 44533f10fee130498fb811eabb72e2afdc3c0f56 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 15:50:43 +0200 Subject: [PATCH 04/10] std: Introduce std.unicode.utf8ValidCodepoint --- lib/std/unicode.zig | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 06dd78bd4092..ecce1b7722e7 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -153,6 +153,15 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { return value; } +/// Returns true if the given unicode codepoint can be encoded in UTF-8. +pub fn utf8ValidCodepoint(value: u21) bool { + return switch (value) { + 0xD800...0xDFFF => false, // Surrogates range + 0x110000...0x1FFFFF => false, // Above the maximum codepoint value + else => true, + }; +} + /// Returns the length of a supplied UTF-8 string literal in terms of unicode /// codepoints. /// Asserts that the data is valid UTF-8. 
@@ -785,3 +794,19 @@ test "utf8 count codepoints" { try testUtf8CountCodepoints(); comptime testUtf8CountCodepoints() catch unreachable; } + +fn testUtf8ValidCodepoint() !void { + testing.expect(utf8ValidCodepoint('e')); + testing.expect(utf8ValidCodepoint('ë')); + testing.expect(utf8ValidCodepoint('は')); + testing.expect(utf8ValidCodepoint(0xe000)); + testing.expect(utf8ValidCodepoint(0x10ffff)); + testing.expect(!utf8ValidCodepoint(0xd800)); + testing.expect(!utf8ValidCodepoint(0xdfff)); + testing.expect(!utf8ValidCodepoint(0x110000)); +} + +test "utf8 valid codepoint" { + try testUtf8ValidCodepoint(); + comptime testUtf8ValidCodepoint() catch unreachable; +} From 675de8d6b723aa4be6a51e7f371364799dd7491b Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 15:59:10 +0200 Subject: [PATCH 05/10] Clean up the unicode codepoint formatter a bit --- lib/std/fmt.zig | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index 8a3bf2aa2de4..8e4f0993d90c 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -7,6 +7,7 @@ const std = @import("std.zig"); const math = std.math; const assert = std.debug.assert; const mem = std.mem; +const unicode = std.unicode; const builtin = @import("builtin"); const errol = @import("fmt/errol.zig"); const lossyCast = std.math.lossyCast; @@ -653,17 +654,17 @@ pub fn formatUnicodeCodepoint( options: FormatOptions, writer: anytype, ) !void { - var buf: [4]u8 = undefined; + if (unicode.utf8ValidCodepoint(c)) { + var buf: [4]u8 = undefined; + // The codepoint is surely valid, hence the use of unreachable + const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) { + error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable, + }; + return formatBuf(buf[0..len], options, writer); + } + // In case of error output the replacement char U+FFFD - const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) 
{ - error.Utf8CannotEncodeSurrogateHalf => { - return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd }); - }, - error.CodepointTooLarge => { - return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd }); - }, - }; - return writer.writeAll(buf[0..len]); + return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer); } pub fn formatBuf( From 0316ac959c1435eba4c63579feb0dced05fba366 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 16:14:47 +0200 Subject: [PATCH 06/10] Make std.formatBuf UTF-8 aware --- lib/std/fmt.zig | 54 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index 8e4f0993d90c..d449e85c9a9f 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -672,25 +672,34 @@ pub fn formatBuf( options: FormatOptions, writer: anytype, ) !void { - const width = options.width orelse buf.len; - const padding = if (width > buf.len) (width - buf.len) else 0; - - switch (options.alignment) { - .Left => { - try writer.writeAll(buf); - try writer.writeByteNTimes(options.fill, padding); - }, - .Center => { - const left_padding = padding / 2; - const right_padding = (padding + 1) / 2; - try writer.writeByteNTimes(options.fill, left_padding); - try writer.writeAll(buf); - try writer.writeByteNTimes(options.fill, right_padding); - }, - .Right => { - try writer.writeByteNTimes(options.fill, padding); - try writer.writeAll(buf); - }, + if (options.width) |min_width| { + // In case of error assume the buffer content is ASCII-encoded + const width = unicode.utf8CountCodepoints(buf) catch |_| buf.len; + const padding = if (width < min_width) min_width - width else 0; + + if (padding == 0) + return writer.writeAll(buf); + + switch (options.alignment) { + .Left => { + try writer.writeAll(buf); + try writer.writeByteNTimes(options.fill, padding); + }, + .Center => { + const left_padding = padding / 2; + const right_padding = (padding + 1) / 2; + try writer.writeByteNTimes(options.fill, 
left_padding); + try writer.writeAll(buf); + try writer.writeByteNTimes(options.fill, right_padding); + }, + .Right => { + try writer.writeByteNTimes(options.fill, padding); + try writer.writeAll(buf); + }, + } + } else { + // Fast path, avoid counting the number of codepoints + try writer.writeAll(buf); } } @@ -1442,6 +1451,10 @@ test "int.padded" { try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)}); try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)}); try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)}); + + try testFmt("UTF-8: 'ü '", "UTF-8: '{u:<4}'", .{'ü'}); + try testFmt("UTF-8: ' ü'", "UTF-8: '{u:>4}'", .{'ü'}); + try testFmt("UTF-8: ' ü '", "UTF-8: '{u:^4}'", .{'ü'}); } test "buffer" { @@ -1971,6 +1984,9 @@ test "padding" { try testFmt("==================Filled", "{:=>24}", .{"Filled"}); try testFmt(" Centered ", "{:^24}", .{"Centered"}); try testFmt("-", "{:-^1}", .{""}); + try testFmt("==crêpe===", "{:=^10}", .{"crêpe"}); + try testFmt("=====crêpe", "{:=>10}", .{"crêpe"}); + try testFmt("crêpe=====", "{:=<10}", .{"crêpe"}); } test "decimal float padding" { From 1982e0c18a5d46a86ca411661a4d25039eb269e7 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 16:15:37 +0200 Subject: [PATCH 07/10] Fix typo in documentation --- lib/std/fmt.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index d449e85c9a9f..238fc0595d68 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -77,7 +77,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool { /// - `b`: output integer value in binary notation /// - `o`: output integer value in octal notation /// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max. -/// - `u`: output integer as an UTF-8 sequence. Integer type must have 32 bits at max. +/// - `u`: output integer as an UTF-8 sequence. Integer type must have 21 bits at max. /// - `*`: output the address of the value instead of the value itself. 
/// /// If a formatted user type contains a function of the type From 3a1f515e09902eeaafce0d1cd03c8472af6eacd0 Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Mon, 21 Sep 2020 21:36:17 +0200 Subject: [PATCH 08/10] Address review comments --- lib/std/fmt.zig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index 238fc0595d68..b98a9c484057 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -561,7 +561,7 @@ pub fn formatIntValue( if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) { return formatUnicodeCodepoint(@as(u21, int_value), options, writer); } else { - @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence"); + @compileError("Cannot print integer that is larger than 21 bits as an UTF-8 sequence"); } } else if (comptime std.mem.eql(u8, fmt, "b")) { radix = 2; @@ -657,7 +657,7 @@ pub fn formatUnicodeCodepoint( if (unicode.utf8ValidCodepoint(c)) { var buf: [4]u8 = undefined; // The codepoint is surely valid, hence the use of unreachable - const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) { + const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) { error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable, }; return formatBuf(buf[0..len], options, writer); From 53c1624074f55973b8d6c160ba0d55b2a5ca09ad Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Tue, 22 Sep 2020 15:26:41 +0200 Subject: [PATCH 09/10] std: Make utf8CountCodepoints much faster Make the code easier for the optimizer to work with and introduce a fast path for ASCII sequences. Introduce a benchmark harness to start tracking the performance of ops on utf8. 
--- lib/std/unicode.zig | 45 ++++++++---- lib/std/unicode/throughput_test.zig | 104 ++++++++++++++++++---------- 2 files changed, 100 insertions(+), 49 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index ecce1b7722e7..2d4d4b40d96f 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 { /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - return switch (@clz(u8, ~first_byte)) { - 0 => 1, - 2 => 2, - 3 => 3, - 4 => 4, + // The switch is optimized much better than a "smart" approach using @clz + return switch (first_byte) { + 0b0000_0000 ... 0b0111_1111 => 1, + 0b1100_0000 ... 0b1101_1111 => 2, + 0b1110_0000 ... 0b1110_1111 => 3, + 0b1111_0000 ... 0b1111_0111 => 4, else => error.Utf8InvalidStartByte, }; } @@ -156,8 +157,8 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { /// Returns true if the given unicode codepoint can be encoded in UTF-8. pub fn utf8ValidCodepoint(value: u21) bool { return switch (value) { - 0xD800...0xDFFF => false, // Surrogates range - 0x110000...0x1FFFFF => false, // Above the maximum codepoint value + 0xD800 ... 0xDFFF => false, // Surrogates range + 0x110000 ... 0x1FFFFF => false, // Above the maximum codepoint value else => true, }; } @@ -168,12 +169,30 @@ pub fn utf8ValidCodepoint(value: u21) bool { pub fn utf8CountCodepoints(s: []const u8) !usize { var len: usize = 0; + const N = @sizeOf(usize); + const MASK = 0x80 * (std.math.maxInt(usize) / 0xff); + var i: usize = 0; - while (i < s.len) : (len += 1) { - const n = try utf8ByteSequenceLength(s[i]); - if (i + n > s.len) return error.TruncatedInput; - _ = try utf8Decode(s[i .. 
i + n]); - i += n; + while (i < s.len) { + // Fast path for ASCII sequences + while (i + N <= s.len) : (i += N) { + const v = mem.readIntNative(usize, s[i..][0..N]); + if (v & MASK != 0) break; + len += N; + } + + if (i < s.len) { + const n = try utf8ByteSequenceLength(s[i]); + if (i + n > s.len) return error.TruncatedInput; + + switch (n) { + 1 => {}, // ASCII, no validation needed + else => _ = try utf8Decode(s[i .. i + n]), + } + + i += n; + len += 1; + } } return len; @@ -787,7 +806,7 @@ fn testUtf8CountCodepoints() !void { testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij")); testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö")); testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは")); - testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); + // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80")); } test "utf8 count codepoints" { diff --git a/lib/std/unicode/throughput_test.zig b/lib/std/unicode/throughput_test.zig index e59953a21f30..5474124fd2ed 100644 --- a/lib/std/unicode/throughput_test.zig +++ b/lib/std/unicode/throughput_test.zig @@ -3,47 +3,79 @@ // This file is part of [zig](https://ziglang.org/), which is MIT licensed. // The MIT license requires this copyright notice to be included in all copies // and substantial portions of the software. 
-const builtin = @import("builtin"); const std = @import("std"); +const builtin = std.builtin; +const time = std.time; +const unicode = std.unicode; + +const Timer = time.Timer; + +const N = 1_000_000; + +const KiB = 1024; +const MiB = 1024 * KiB; +const GiB = 1024 * MiB; + +const ResultCount = struct { + count: usize, + throughput: u64, +}; + +fn benchmarkCodepointCount(buf: []const u8) !ResultCount { + var timer = try Timer.start(); + + const bytes = N * buf.len; + + const start = timer.lap(); + var i: usize = 0; + var r: usize = undefined; + while (i < N) : (i += 1) { + r = try @call( + .{ .modifier = .never_inline }, + std.unicode.utf8CountCodepoints, + .{buf}, + ); + } + const end = timer.read(); + + const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s; + const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s); + + return ResultCount{ .count = r, .throughput = throughput }; +} pub fn main() !void { const stdout = std.io.getStdOut().outStream(); const args = try std.process.argsAlloc(std.heap.page_allocator); - // Warm up runs - var buffer0: [32767]u16 align(4096) = undefined; - _ = try std.unicode.utf8ToUtf16Le(&buffer0, args[1]); - _ = try std.unicode.utf8ToUtf16Le_better(&buffer0, args[1]); - - @fence(.SeqCst); - var timer = try std.time.Timer.start(); - @fence(.SeqCst); - - var buffer1: [32767]u16 align(4096) = undefined; - _ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]); - - @fence(.SeqCst); - const elapsed_ns_orig = timer.lap(); - @fence(.SeqCst); - - var buffer2: [32767]u16 align(4096) = undefined; - _ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]); - - @fence(.SeqCst); - const elapsed_ns_better = timer.lap(); - @fence(.SeqCst); - - std.debug.warn("original utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{ - elapsed_ns_orig, elapsed_ns_orig / 1000000, - }); - std.debug.warn("new utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{ - elapsed_ns_better, elapsed_ns_better / 1000000, - }); - asm volatile ("nop" - : - : [a] "r" 
(&buffer1), - [b] "r" (&buffer2) - : "memory" - ); + try stdout.print("short ASCII strings\n", .{}); + { + const result = try benchmarkCodepointCount("abc"); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("short Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("ŌŌŌ"); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("pure ASCII strings\n", .{}); + { + const result = try benchmarkCodepointCount("hello" ** 16); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("pure Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("こんにちは" ** 16); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } + + try stdout.print("mixed ASCII/Unicode strings\n", .{}); + { + const result = try benchmarkCodepointCount("Hyvää huomenta" ** 16); + try stdout.print(" count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count }); + } } From 60638f0c82374714b55de094a5f3cec4d05e9e9b Mon Sep 17 00:00:00 2001 From: LemonBoy Date: Thu, 19 Nov 2020 18:16:23 +0100 Subject: [PATCH 10/10] Nicer code for the error code path --- lib/std/fmt.zig | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig index b98a9c484057..8a6e272b6b48 100644 --- a/lib/std/fmt.zig +++ b/lib/std/fmt.zig @@ -654,17 +654,14 @@ pub fn formatUnicodeCodepoint( options: FormatOptions, writer: anytype, ) !void { - if (unicode.utf8ValidCodepoint(c)) { - var buf: [4]u8 = undefined; - // The codepoint is surely valid, hence the use of unreachable - const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) { - error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable, - }; - return formatBuf(buf[0..len], 
options, writer); - } - - // In case of error output the replacement char U+FFFD - return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer); + var buf: [4]u8 = undefined; + const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) { + error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => { + // In case of error output the replacement char U+FFFD + return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer); + }, + }; + return formatBuf(buf[0..len], options, writer); } pub fn formatBuf(