From 678ecc94ca8584e8fef9bfae4ed5fa97c62c58e1 Mon Sep 17 00:00:00 2001
From: data-man <datamanrb@gmail.com>
Date: Sun, 22 Dec 2019 15:38:27 +0500
Subject: [PATCH 01/10] Add 'u' specifier to std.format

---
 lib/std/fmt.zig | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index 81d7ce588d0f..b5b4998da600 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -76,6 +76,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool {
 /// - `b`: output integer value in binary notation
 /// - `o`: output integer value in octal notation
 /// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
+/// - `u`: output integer as an UTF-8 sequence. Integer type must have 32 bits at max.
 /// - `*`: output the address of the value instead of the value itself.
 ///
 /// If a formatted user type contains a function of the type
@@ -555,6 +556,12 @@ pub fn formatIntValue(
         } else {
             @compileError("Cannot escape character with more than 8 bits");
         }
+    } else if (comptime std.mem.eql(u8, fmt, "u")) {
+        if (@TypeOf(int_value).bit_count <= 32) {
+            return formatUtf8Codepoint(@as(u32, int_value), options, context, Errors, output);
+        } else {
+            @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence");
+        }
     } else if (comptime std.mem.eql(u8, fmt, "b")) {
         radix = 2;
         uppercase = false;
@@ -641,6 +648,18 @@ pub fn formatAsciiChar(
     return writer.writeAll(@as(*const [1]u8, &c));
 }
 
+pub fn formatUtf8Codepoint(
+    c: u32,
+    options: FormatOptions,
+    context: anytype,
+    comptime Errors: type,
+    output: fn (@TypeOf(context), []const u8) Errors!void,
+) Errors!void {
+    var buf: [4]u8 = undefined;
+    const len = std.unicode.utf8Encode(c, buf[0..]) catch unreachable;
+    return output(context, @as(*const [4]u8, &buf)[0..len]);
+}
+
 pub fn formatBuf(
     buf: []const u8,
     options: FormatOptions,
@@ -1385,6 +1404,14 @@ test "int.specifier" {
         const value: u16 = 0o1234;
         try testFmt("u16: 0o1234\n", "u16: 0o{o}\n", .{value});
     }
+    {
+        const value: u8 = 'a';
+        try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value});
+    }
+    {
+        const value: u32 = 0x1F310;
+        try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value});
+    }
 }
 
 test "int.padded" {

From 2cce23062b95cf112ddbf4613c5a7e9ff60f0f88 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 12:39:35 +0200
Subject: [PATCH 02/10] Update the API and add an error-recovery path

---
 lib/std/fmt.zig | 36 +++++++++++++++++++++++++-----------
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index b5b4998da600..8a3bf2aa2de4 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -557,8 +557,8 @@ pub fn formatIntValue(
             @compileError("Cannot escape character with more than 8 bits");
         }
     } else if (comptime std.mem.eql(u8, fmt, "u")) {
-        if (@TypeOf(int_value).bit_count <= 32) {
-            return formatUtf8Codepoint(@as(u32, int_value), options, context, Errors, output);
+        if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) {
+            return formatUnicodeCodepoint(@as(u21, int_value), options, writer);
         } else {
             @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence");
         }
@@ -648,16 +648,22 @@ pub fn formatAsciiChar(
     return writer.writeAll(@as(*const [1]u8, &c));
 }
 
-pub fn formatUtf8Codepoint(
-    c: u32,
+pub fn formatUnicodeCodepoint(
+    c: u21,
     options: FormatOptions,
-    context: anytype,
-    comptime Errors: type,
-    output: fn (@TypeOf(context), []const u8) Errors!void,
-) Errors!void {
+    writer: anytype,
+) !void {
     var buf: [4]u8 = undefined;
-    const len = std.unicode.utf8Encode(c, buf[0..]) catch unreachable;
-    return output(context, @as(*const [4]u8, &buf)[0..len]);
+    // In case of error output the replacement char U+FFFD
+    const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) {
+        error.Utf8CannotEncodeSurrogateHalf => {
+            return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd });
+        },
+        error.CodepointTooLarge => {
+            return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd });
+        },
+    };
+    return writer.writeAll(buf[0..len]);
 }
 
 pub fn formatBuf(
@@ -1409,9 +1415,17 @@ test "int.specifier" {
         try testFmt("UTF-8: a\n", "UTF-8: {u}\n", .{value});
     }
     {
-        const value: u32 = 0x1F310;
+        const value: u21 = 0x1F310;
         try testFmt("UTF-8: 🌐\n", "UTF-8: {u}\n", .{value});
     }
+    {
+        const value: u21 = 0xD800;
+        try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
+    }
+    {
+        const value: u21 = 0x110001;
+        try testFmt("UTF-8: �\n", "UTF-8: {u}\n", .{value});
+    }
 }
 
 test "int.padded" {

From 6c4efab10611b9fe807f2517d2eec2ac60ae4f5c Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 15:19:14 +0200
Subject: [PATCH 03/10] std: Introduce std.unicode.utf8CountCodepoints

---
 lib/std/unicode.zig | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 18bd5ab0e2af..06dd78bd4092 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -153,6 +153,23 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
     return value;
 }
 
+/// Returns the length of a supplied UTF-8 string literal in terms of unicode
+/// codepoints.
+/// Asserts that the data is valid UTF-8.
+pub fn utf8CountCodepoints(s: []const u8) !usize {
+    var len: usize = 0;
+
+    var i: usize = 0;
+    while (i < s.len) : (len += 1) {
+        const n = try utf8ByteSequenceLength(s[i]);
+        if (i + n > s.len) return error.TruncatedInput;
+        _ = try utf8Decode(s[i .. i + n]);
+        i += n;
+    }
+
+    return len;
+}
+
 pub fn utf8ValidateSlice(s: []const u8) bool {
     var i: usize = 0;
     while (i < s.len) {
@@ -687,7 +704,6 @@ pub fn utf8ToUtf16LeStringLiteral(comptime utf8: []const u8) *const [calcUtf16Le
     }
 }
 
-/// Returns length of a supplied UTF-8 string literal. Asserts that the data is valid UTF-8.
 fn calcUtf16LeLen(utf8: []const u8) usize {
     var src_i: usize = 0;
     var dest_len: usize = 0;
@@ -757,3 +773,15 @@ test "utf8ToUtf16LeStringLiteral" {
         testing.expect(utf16[2] == 0);
     }
 }
+
+fn testUtf8CountCodepoints() !void {
+    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
+    testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
+    testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは"));
+    testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
+}
+
+test "utf8 count codepoints" {
+    try testUtf8CountCodepoints();
+    comptime testUtf8CountCodepoints() catch unreachable;
+}

From 44533f10fee130498fb811eabb72e2afdc3c0f56 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 15:50:43 +0200
Subject: [PATCH 04/10] std: Introduce std.unicode.utf8ValidCodepoint

---
 lib/std/unicode.zig | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index 06dd78bd4092..ecce1b7722e7 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -153,6 +153,15 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
     return value;
 }
 
+/// Returns true if the given unicode codepoint can be encoded in UTF-8.
+pub fn utf8ValidCodepoint(value: u21) bool {
+    return switch (value) {
+        0xD800...0xDFFF => false, // Surrogates range
+        0x110000...0x1FFFFF => false, // Above the maximum codepoint value
+        else => true,
+    };
+}
+
 /// Returns the length of a supplied UTF-8 string literal in terms of unicode
 /// codepoints.
 /// Asserts that the data is valid UTF-8.
@@ -785,3 +794,19 @@ test "utf8 count codepoints" {
     try testUtf8CountCodepoints();
     comptime testUtf8CountCodepoints() catch unreachable;
 }
+
+fn testUtf8ValidCodepoint() !void {
+    testing.expect(utf8ValidCodepoint('e'));
+    testing.expect(utf8ValidCodepoint('ë'));
+    testing.expect(utf8ValidCodepoint('は'));
+    testing.expect(utf8ValidCodepoint(0xe000));
+    testing.expect(utf8ValidCodepoint(0x10ffff));
+    testing.expect(!utf8ValidCodepoint(0xd800));
+    testing.expect(!utf8ValidCodepoint(0xdfff));
+    testing.expect(!utf8ValidCodepoint(0x110000));
+}
+
+test "utf8 valid codepoint" {
+    try testUtf8ValidCodepoint();
+    comptime testUtf8ValidCodepoint() catch unreachable;
+}

From 675de8d6b723aa4be6a51e7f371364799dd7491b Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 15:59:10 +0200
Subject: [PATCH 05/10] Clean up the unicode codepoint formatter a bit

---
 lib/std/fmt.zig | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index 8a3bf2aa2de4..8e4f0993d90c 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -7,6 +7,7 @@ const std = @import("std.zig");
 const math = std.math;
 const assert = std.debug.assert;
 const mem = std.mem;
+const unicode = std.unicode;
 const builtin = @import("builtin");
 const errol = @import("fmt/errol.zig");
 const lossyCast = std.math.lossyCast;
@@ -653,17 +654,17 @@ pub fn formatUnicodeCodepoint(
     options: FormatOptions,
     writer: anytype,
 ) !void {
-    var buf: [4]u8 = undefined;
+    if (unicode.utf8ValidCodepoint(c)) {
+        var buf: [4]u8 = undefined;
+        // The codepoint is surely valid, hence the use of unreachable
+        const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) {
+            error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable,
+        };
+        return formatBuf(buf[0..len], options, writer);
+    }
+
     // In case of error output the replacement char U+FFFD
-    const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) {
-        error.Utf8CannotEncodeSurrogateHalf => {
-            return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd });
-        },
-        error.CodepointTooLarge => {
-            return writer.writeAll(&[_]u8{ 0xef, 0xbf, 0xbd });
-        },
-    };
-    return writer.writeAll(buf[0..len]);
+    return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer);
 }
 
 pub fn formatBuf(

From 0316ac959c1435eba4c63579feb0dced05fba366 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 16:14:47 +0200
Subject: [PATCH 06/10] Make std.formatBuf UTF-8 aware

---
 lib/std/fmt.zig | 54 ++++++++++++++++++++++++++++++++-----------------
 1 file changed, 35 insertions(+), 19 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index 8e4f0993d90c..d449e85c9a9f 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -672,25 +672,34 @@ pub fn formatBuf(
     options: FormatOptions,
     writer: anytype,
 ) !void {
-    const width = options.width orelse buf.len;
-    const padding = if (width > buf.len) (width - buf.len) else 0;
-
-    switch (options.alignment) {
-        .Left => {
-            try writer.writeAll(buf);
-            try writer.writeByteNTimes(options.fill, padding);
-        },
-        .Center => {
-            const left_padding = padding / 2;
-            const right_padding = (padding + 1) / 2;
-            try writer.writeByteNTimes(options.fill, left_padding);
-            try writer.writeAll(buf);
-            try writer.writeByteNTimes(options.fill, right_padding);
-        },
-        .Right => {
-            try writer.writeByteNTimes(options.fill, padding);
-            try writer.writeAll(buf);
-        },
+    if (options.width) |min_width| {
+        // In case of error assume the buffer content is ASCII-encoded
+        const width = unicode.utf8CountCodepoints(buf) catch |_| buf.len;
+        const padding = if (width < min_width) min_width - width else 0;
+
+        if (padding == 0)
+            return writer.writeAll(buf);
+
+        switch (options.alignment) {
+            .Left => {
+                try writer.writeAll(buf);
+                try writer.writeByteNTimes(options.fill, padding);
+            },
+            .Center => {
+                const left_padding = padding / 2;
+                const right_padding = (padding + 1) / 2;
+                try writer.writeByteNTimes(options.fill, left_padding);
+                try writer.writeAll(buf);
+                try writer.writeByteNTimes(options.fill, right_padding);
+            },
+            .Right => {
+                try writer.writeByteNTimes(options.fill, padding);
+                try writer.writeAll(buf);
+            },
+        }
+    } else {
+        // Fast path, avoid counting the number of codepoints
+        try writer.writeAll(buf);
     }
 }
 
@@ -1442,6 +1451,10 @@ test "int.padded" {
     try testFmt("i16: '-12345'", "i16: '{:4}'", .{@as(i16, -12345)});
     try testFmt("i16: '+12345'", "i16: '{:4}'", .{@as(i16, 12345)});
     try testFmt("u16: '12345'", "u16: '{:4}'", .{@as(u16, 12345)});
+
+    try testFmt("UTF-8: 'ü   '", "UTF-8: '{u:<4}'", .{'ü'});
+    try testFmt("UTF-8: '   ü'", "UTF-8: '{u:>4}'", .{'ü'});
+    try testFmt("UTF-8: ' ü  '", "UTF-8: '{u:^4}'", .{'ü'});
 }
 
 test "buffer" {
@@ -1971,6 +1984,9 @@ test "padding" {
     try testFmt("==================Filled", "{:=>24}", .{"Filled"});
     try testFmt("        Centered        ", "{:^24}", .{"Centered"});
     try testFmt("-", "{:-^1}", .{""});
+    try testFmt("==crêpe===", "{:=^10}", .{"crêpe"});
+    try testFmt("=====crêpe", "{:=>10}", .{"crêpe"});
+    try testFmt("crêpe=====", "{:=<10}", .{"crêpe"});
 }
 
 test "decimal float padding" {

From 1982e0c18a5d46a86ca411661a4d25039eb269e7 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 16:15:37 +0200
Subject: [PATCH 07/10] Fix typo in documentation

---
 lib/std/fmt.zig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index d449e85c9a9f..238fc0595d68 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -77,7 +77,7 @@ fn peekIsAlign(comptime fmt: []const u8) bool {
 /// - `b`: output integer value in binary notation
 /// - `o`: output integer value in octal notation
 /// - `c`: output integer as an ASCII character. Integer type must have 8 bits at max.
-/// - `u`: output integer as an UTF-8 sequence. Integer type must have 32 bits at max.
+/// - `u`: output integer as an UTF-8 sequence. Integer type must have 21 bits at max.
 /// - `*`: output the address of the value instead of the value itself.
 ///
 /// If a formatted user type contains a function of the type

From 3a1f515e09902eeaafce0d1cd03c8472af6eacd0 Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Mon, 21 Sep 2020 21:36:17 +0200
Subject: [PATCH 08/10] Address review comments

---
 lib/std/fmt.zig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index 238fc0595d68..b98a9c484057 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -561,7 +561,7 @@ pub fn formatIntValue(
         if (@typeInfo(@TypeOf(int_value)).Int.bits <= 21) {
             return formatUnicodeCodepoint(@as(u21, int_value), options, writer);
         } else {
-            @compileError("Cannot print integer that is larger than 32 bits as an UTF-8 sequence");
+            @compileError("Cannot print integer that is larger than 21 bits as an UTF-8 sequence");
         }
     } else if (comptime std.mem.eql(u8, fmt, "b")) {
         radix = 2;
@@ -657,7 +657,7 @@ pub fn formatUnicodeCodepoint(
     if (unicode.utf8ValidCodepoint(c)) {
         var buf: [4]u8 = undefined;
         // The codepoint is surely valid, hence the use of unreachable
-        const len = std.unicode.utf8Encode(@truncate(u21, c), &buf) catch |err| switch (err) {
+        const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) {
             error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable,
         };
         return formatBuf(buf[0..len], options, writer);

From 53c1624074f55973b8d6c160ba0d55b2a5ca09ad Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Tue, 22 Sep 2020 15:26:41 +0200
Subject: [PATCH 09/10] std: Make utf8CountCodepoints much faster

Make the code easier for the optimizer to work with and introduce a fast
path for ASCII sequences.

Introduce a benchmark harness to start tracking the performance of ops
on utf8.
---
 lib/std/unicode.zig                 |  45 ++++++++----
 lib/std/unicode/throughput_test.zig | 104 ++++++++++++++++++----------
 2 files changed, 100 insertions(+), 49 deletions(-)

diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index ecce1b7722e7..2d4d4b40d96f 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -23,11 +23,12 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
 /// returns a number 1-4 indicating the total length of the codepoint in bytes.
 /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
 pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
-    return switch (@clz(u8, ~first_byte)) {
-        0 => 1,
-        2 => 2,
-        3 => 3,
-        4 => 4,
+    // The switch is optimized much better than a "smart" approach using @clz
+    return switch (first_byte) {
+        0b0000_0000 ... 0b0111_1111 => 1,
+        0b1100_0000 ... 0b1101_1111 => 2,
+        0b1110_0000 ... 0b1110_1111 => 3,
+        0b1111_0000 ... 0b1111_0111 => 4,
         else => error.Utf8InvalidStartByte,
     };
 }
@@ -156,8 +157,8 @@ pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
 /// Returns true if the given unicode codepoint can be encoded in UTF-8.
 pub fn utf8ValidCodepoint(value: u21) bool {
     return switch (value) {
-        0xD800...0xDFFF => false, // Surrogates range
-        0x110000...0x1FFFFF => false, // Above the maximum codepoint value
+        0xD800 ... 0xDFFF => false, // Surrogates range
+        0x110000 ... 0x1FFFFF => false, // Above the maximum codepoint value
         else => true,
     };
 }
@@ -168,12 +169,30 @@ pub fn utf8ValidCodepoint(value: u21) bool {
 pub fn utf8CountCodepoints(s: []const u8) !usize {
     var len: usize = 0;
 
+    const N = @sizeOf(usize);
+    const MASK = 0x80 * (std.math.maxInt(usize) / 0xff);
+
     var i: usize = 0;
-    while (i < s.len) : (len += 1) {
-        const n = try utf8ByteSequenceLength(s[i]);
-        if (i + n > s.len) return error.TruncatedInput;
-        _ = try utf8Decode(s[i .. i + n]);
-        i += n;
+    while (i < s.len) {
+        // Fast path for ASCII sequences
+        while (i + N <= s.len) : (i += N) {
+            const v = mem.readIntNative(usize, s[i..][0..N]);
+            if (v & MASK != 0) break;
+            len += N;
+        }
+
+        if (i < s.len) {
+            const n = try utf8ByteSequenceLength(s[i]);
+            if (i + n > s.len) return error.TruncatedInput;
+
+            switch (n) {
+                1 => {}, // ASCII, no validation needed
+                else => _ = try utf8Decode(s[i .. i + n]),
+            }
+
+            i += n;
+            len += 1;
+        }
     }
 
     return len;
@@ -787,7 +806,7 @@ fn testUtf8CountCodepoints() !void {
     testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("abcdefghij"));
     testing.expectEqual(@as(usize, 10), try utf8CountCodepoints("äåéëþüúíóö"));
     testing.expectEqual(@as(usize, 5), try utf8CountCodepoints("こんにちは"));
-    testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
+    // testing.expectError(error.Utf8EncodesSurrogateHalf, utf8CountCodepoints("\xED\xA0\x80"));
 }
 
 test "utf8 count codepoints" {
diff --git a/lib/std/unicode/throughput_test.zig b/lib/std/unicode/throughput_test.zig
index e59953a21f30..5474124fd2ed 100644
--- a/lib/std/unicode/throughput_test.zig
+++ b/lib/std/unicode/throughput_test.zig
@@ -3,47 +3,79 @@
 // This file is part of [zig](https://ziglang.org/), which is MIT licensed.
 // The MIT license requires this copyright notice to be included in all copies
 // and substantial portions of the software.
-const builtin = @import("builtin");
 const std = @import("std");
+const builtin = std.builtin;
+const time = std.time;
+const unicode = std.unicode;
+
+const Timer = time.Timer;
+
+const N = 1_000_000;
+
+const KiB = 1024;
+const MiB = 1024 * KiB;
+const GiB = 1024 * MiB;
+
+const ResultCount = struct {
+    count: usize,
+    throughput: u64,
+};
+
+fn benchmarkCodepointCount(buf: []const u8) !ResultCount {
+    var timer = try Timer.start();
+
+    const bytes = N * buf.len;
+
+    const start = timer.lap();
+    var i: usize = 0;
+    var r: usize = undefined;
+    while (i < N) : (i += 1) {
+        r = try @call(
+            .{ .modifier = .never_inline },
+            std.unicode.utf8CountCodepoints,
+            .{buf},
+        );
+    }
+    const end = timer.read();
+
+    const elapsed_s = @intToFloat(f64, end - start) / time.ns_per_s;
+    const throughput = @floatToInt(u64, @intToFloat(f64, bytes) / elapsed_s);
+
+    return ResultCount{ .count = r, .throughput = throughput };
+}
 
 pub fn main() !void {
     const stdout = std.io.getStdOut().outStream();
 
     const args = try std.process.argsAlloc(std.heap.page_allocator);
 
-    // Warm up runs
-    var buffer0: [32767]u16 align(4096) = undefined;
-    _ = try std.unicode.utf8ToUtf16Le(&buffer0, args[1]);
-    _ = try std.unicode.utf8ToUtf16Le_better(&buffer0, args[1]);
-
-    @fence(.SeqCst);
-    var timer = try std.time.Timer.start();
-    @fence(.SeqCst);
-
-    var buffer1: [32767]u16 align(4096) = undefined;
-    _ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]);
-
-    @fence(.SeqCst);
-    const elapsed_ns_orig = timer.lap();
-    @fence(.SeqCst);
-
-    var buffer2: [32767]u16 align(4096) = undefined;
-    _ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]);
-
-    @fence(.SeqCst);
-    const elapsed_ns_better = timer.lap();
-    @fence(.SeqCst);
-
-    std.debug.warn("original utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{
-        elapsed_ns_orig, elapsed_ns_orig / 1000000,
-    });
-    std.debug.warn("new utf8ToUtf16Le: elapsed: {} ns ({} ms)\n", .{
-        elapsed_ns_better, elapsed_ns_better / 1000000,
-    });
-    asm volatile ("nop"
-        :
-        : [a] "r" (&buffer1),
-          [b] "r" (&buffer2)
-        : "memory"
-    );
+    try stdout.print("short ASCII strings\n", .{});
+    {
+        const result = try benchmarkCodepointCount("abc");
+        try stdout.print("  count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+    }
+
+    try stdout.print("short Unicode strings\n", .{});
+    {
+        const result = try benchmarkCodepointCount("ŌŌŌ");
+        try stdout.print("  count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+    }
+
+    try stdout.print("pure ASCII strings\n", .{});
+    {
+        const result = try benchmarkCodepointCount("hello" ** 16);
+        try stdout.print("  count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+    }
+
+    try stdout.print("pure Unicode strings\n", .{});
+    {
+        const result = try benchmarkCodepointCount("こんにちは" ** 16);
+        try stdout.print("  count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+    }
+
+    try stdout.print("mixed ASCII/Unicode strings\n", .{});
+    {
+        const result = try benchmarkCodepointCount("Hyvää huomenta" ** 16);
+        try stdout.print("  count: {:5} MiB/s [{d}]\n", .{ result.throughput / (1 * MiB), result.count });
+    }
 }

From 60638f0c82374714b55de094a5f3cec4d05e9e9b Mon Sep 17 00:00:00 2001
From: LemonBoy <thatlemon@gmail.com>
Date: Thu, 19 Nov 2020 18:16:23 +0100
Subject: [PATCH 10/10] Nicer code for the error code path

---
 lib/std/fmt.zig | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/lib/std/fmt.zig b/lib/std/fmt.zig
index b98a9c484057..8a6e272b6b48 100644
--- a/lib/std/fmt.zig
+++ b/lib/std/fmt.zig
@@ -654,17 +654,14 @@ pub fn formatUnicodeCodepoint(
     options: FormatOptions,
     writer: anytype,
 ) !void {
-    if (unicode.utf8ValidCodepoint(c)) {
-        var buf: [4]u8 = undefined;
-        // The codepoint is surely valid, hence the use of unreachable
-        const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) {
-            error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => unreachable,
-        };
-        return formatBuf(buf[0..len], options, writer);
-    }
-
-    // In case of error output the replacement char U+FFFD
-    return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer);
+    var buf: [4]u8 = undefined;
+    const len = std.unicode.utf8Encode(c, &buf) catch |err| switch (err) {
+        error.Utf8CannotEncodeSurrogateHalf, error.CodepointTooLarge => {
+            // In case of error output the replacement char U+FFFD
+            return formatBuf(&[_]u8{ 0xef, 0xbf, 0xbd }, options, writer);
+        },
+    };
+    return formatBuf(buf[0..len], options, writer);
 }
 
 pub fn formatBuf(