From 4f75f10f85dae8ec9dc34f5f253d405ca85253ea Mon Sep 17 00:00:00 2001
From: Shawn Landden <shawn@git.icu>
Date: Sun, 7 Jun 2020 00:57:45 +0400
Subject: [PATCH] Unicode (utf-8) rework

* Make the errors clearer. (The functions keep inferred error sets because
  declaring explicit return error types exposes bugs in stage1.)
* Make decoding and length detection the same operation
* Report the location of the invalid character
* Always report errors in Utf8View
---
 lib/std/json.zig    |  13 +-
 lib/std/process.zig |   2 +-
 lib/std/unicode.zig | 360 +++++++++++++++++++++++---------------------
 3 files changed, 194 insertions(+), 181 deletions(-)

diff --git a/lib/std/json.zig b/lib/std/json.zig
index eeceeac8a7f0..ba81e712cc63 100644
--- a/lib/std/json.zig
+++ b/lib/std/json.zig
@@ -2099,7 +2099,7 @@ fn unescapeString(output: []u8, input: []const u8) !void {
                 inIndex += 6;
             } else |err| {
                 // it might be a surrogate pair
-                if (err != error.Utf8CannotEncodeSurrogateHalf) {
+                if (err != error.UnicodeSurrogateHalf) {
                     return error.InvalidUnicodeHexSymbol;
                 }
                 // check if a second code unit is present
@@ -2532,15 +2532,16 @@ pub fn stringify(
                             '\r' => try out_stream.writeAll("\\r"),
                             '\t' => try out_stream.writeAll("\\t"),
                             else => {
-                                const ulen = std.unicode.utf8ByteSequenceLength(value[i]) catch unreachable;
                                 // control characters (only things left with 1 byte length) should always be printed as unicode escapes
-                                if (ulen == 1 or options.string.String.escape_unicode) {
-                                    const codepoint = std.unicode.utf8Decode(value[i .. i + ulen]) catch unreachable;
-                                    try outputUnicodeEscape(codepoint, out_stream);
+                                if ((value[i] < 128) or options.string.String.escape_unicode) {
+                                    const c = std.unicode.utf8Decode(value[i..]) catch unreachable;
+                                    try outputUnicodeEscape(c.codepoint, out_stream);
+                                    i += c.utf8len - 1;
                                 } else {
+                                    var ulen = std.unicode.utf8ByteSequenceLength(value[i]) catch unreachable;
                                     try out_stream.writeAll(value[i .. i + ulen]);
+                                    i += ulen - 1;
                                 }
-                                i += ulen - 1;
                             },
                         }
                     }
diff --git a/lib/std/process.zig b/lib/std/process.zig
index 6ffb7bc1bcf1..432b70811d69 100644
--- a/lib/std/process.zig
+++ b/lib/std/process.zig
@@ -156,7 +156,7 @@ pub const GetEnvVarOwnedError = error{
 pub fn getEnvVarOwned(allocator: *mem.Allocator, key: []const u8) GetEnvVarOwnedError![]u8 {
     if (builtin.os.tag == .windows) {
         const result_w = blk: {
-            const key_w = try std.unicode.utf8ToUtf16LeWithNull(allocator, key);
+            const key_w = std.unicode.utf8ToUtf16LeWithNull(allocator, key) catch |err| return if (err == error.OutOfMemory) error.OutOfMemory else error.InvalidUtf8;
             defer allocator.free(key_w);
 
             break :blk std.os.getenvW(key_w) orelse return error.EnvironmentVariableNotFound;
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
index df2e16a4bf73..bf77b1f51550 100644
--- a/lib/std/unicode.zig
+++ b/lib/std/unicode.zig
@@ -1,9 +1,56 @@
-const std = @import("./std.zig");
-const builtin = @import("builtin");
+const std = @import("std");
 const assert = std.debug.assert;
 const testing = std.testing;
 const mem = std.mem;
 
+// These are the errors the UTF-8 functions below return, but their return types
+// cannot name these sets because the ErrorSet type is too buggy in stage1.
+pub const Utf8Error = UnicodeError || error{
+    Utf8ShortChar,
+    Utf8OverlongEncoding,
+    Utf8InvalidStartByte,
+};
+
+pub const UnicodeError = error{
+    UnicodeSurrogateHalf,
+    UnicodeCodepointTooLarge,
+};
+
+// http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94
+//
+// Table 3-7. Well-Formed UTF-8 Byte Sequences
+//
+// +--------------------+------------+-------------+------------+-------------+
+// | Code Points        | First Byte | Second Byte | Third Byte | Fourth Byte |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+0000..U+007F     | 00..7F     |             |            |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+0080..U+07FF     | C2..DF     | 80..BF      |            |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+0800..U+0FFF     | E0         | A0..BF      | 80..BF     |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+1000..U+CFFF     | E1..EC     | 80..BF      | 80..BF     |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+D000..U+D7FF     | ED         | 80..9F      | 80..BF     |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+E000..U+FFFF     | EE..EF     | 80..BF      | 80..BF     |             |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+10000..U+3FFFF   | F0         | 90..BF      | 80..BF     | 80..BF      |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+40000..U+FFFFF   | F1..F3     | 80..BF      | 80..BF     | 80..BF      |
+// +--------------------+------------+-------------+------------+-------------+
+// | U+100000..U+10FFFF | F4         | 80..8F      | 80..BF     | 80..BF      |
+// +--------------------+------------+-------------+------------+-------------+
+
+pub fn isValidUnicode(c: u21) !void {
+    switch (c) {
+        0x0000...0xd7ff => {},
+        0xd800...0xdfff => return error.UnicodeSurrogateHalf, // UTF-16 surrogate range is not a scalar value
+        0xe000...0x10ffff => {},
+        0x110000...0x1fffff => return error.UnicodeCodepointTooLarge, // 0x1fffff is the largest value a u21 can hold
+    }
+}
+
 /// Returns how many bytes the UTF-8 representation would require
 /// for the given codepoint.
 pub fn utf8CodepointSequenceLength(c: u21) !u3 {
@@ -11,7 +58,7 @@ pub fn utf8CodepointSequenceLength(c: u21) !u3 {
     if (c < 0x800) return @as(u3, 2);
     if (c < 0x10000) return @as(u3, 3);
     if (c < 0x110000) return @as(u3, 4);
-    return error.CodepointTooLarge;
+    return error.UnicodeCodepointTooLarge;
 }
 
 /// Given the first byte of a UTF-8 codepoint,
@@ -46,7 +93,7 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {
             out[1] = @intCast(u8, 0b10000000 | (c & 0b111111));
         },
         3 => {
-            if (0xd800 <= c and c <= 0xdfff) return error.Utf8CannotEncodeSurrogateHalf;
+            if (0xd800 <= c and c <= 0xdfff) return error.UnicodeSurrogateHalf;
             out[0] = @intCast(u8, 0b11100000 | (c >> 12));
             out[1] = @intCast(u8, 0b10000000 | ((c >> 6) & 0b111111));
             out[2] = @intCast(u8, 0b10000000 | (c & 0b111111));
@@ -62,32 +109,41 @@ pub fn utf8Encode(c: u21, out: []u8) !u3 {
     return length;
 }
 
-const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error;
+/// Decodes the UTF-8 codepoint encoded in the given slice of bytes and returns
+/// the length of the character decoded.
+///
+/// Guaranteed to not read bytes past this character.
+///
+/// I wish I didn't have to give this struct a name, but we don't have multiple
+/// return values.
+pub const UnicodeWithUtf8Len = struct {
+    codepoint: u21,
+    utf8len: u3,
+};
 
-/// Decodes the UTF-8 codepoint encoded in the given slice of bytes.
-/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
-/// If you already know the length at comptime, you can call one of
-/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
-pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
-    return switch (bytes.len) {
-        1 => @as(u21, bytes[0]),
-        2 => utf8Decode2(bytes),
-        3 => utf8Decode3(bytes),
-        4 => utf8Decode4(bytes),
-        else => unreachable,
+/// Decodes the first UTF-8 character of `bytes`, returning its codepoint and byte length.
+pub fn utf8Decode(bytes: []const u8) !UnicodeWithUtf8Len {
+    if (bytes.len == 0) return error.Utf8ShortChar; // nothing to decode; avoids indexing bytes[0]
+    const len = try utf8ByteSequenceLength(bytes[0]);
+    if (bytes.len < len) return error.Utf8ShortChar;
+    return UnicodeWithUtf8Len{
+        .codepoint = switch (len) {
+            1 => @as(u21, bytes[0]),
+            2 => try utf8Decode2(bytes[0..2]),
+            3 => try utf8Decode3(bytes[0..3]),
+            4 => try utf8Decode4(bytes[0..4]),
+            else => unreachable,
+        },
+        .utf8len = len,
+    };
+}
 
-const Utf8Decode2Error = error{
-    Utf8ExpectedContinuation,
-    Utf8OverlongEncoding,
-};
-pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
+pub fn utf8Decode2(bytes: []const u8) !u21 {
     assert(bytes.len == 2);
-    assert(bytes[0] & 0b11100000 == 0b11000000);
+    assert(@clz(u8, ~bytes[0]) == 2);
     var value: u21 = bytes[0] & 0b00011111;
 
-    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[1]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[1] & 0b00111111;
 
@@ -96,74 +152,67 @@ pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
     return value;
 }
 
-const Utf8Decode3Error = error{
-    Utf8ExpectedContinuation,
-    Utf8OverlongEncoding,
-    Utf8EncodesSurrogateHalf,
-};
-pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
+pub fn utf8Decode3(bytes: []const u8) !u21 {
     assert(bytes.len == 3);
-    assert(bytes[0] & 0b11110000 == 0b11100000);
+    assert(@clz(u8, ~bytes[0]) == 3);
     var value: u21 = bytes[0] & 0b00001111;
 
-    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[1]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[1] & 0b00111111;
 
-    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[2]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[2] & 0b00111111;
 
     if (value < 0x800) return error.Utf8OverlongEncoding;
-    if (0xd800 <= value and value <= 0xdfff) return error.Utf8EncodesSurrogateHalf;
+    if (0xd800 <= value and value <= 0xdfff) return error.UnicodeSurrogateHalf;
 
     return value;
 }
 
-const Utf8Decode4Error = error{
-    Utf8ExpectedContinuation,
-    Utf8OverlongEncoding,
-    Utf8CodepointTooLarge,
-};
-pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
+pub fn utf8Decode4(bytes: []const u8) !u21 {
     assert(bytes.len == 4);
-    assert(bytes[0] & 0b11111000 == 0b11110000);
+    assert(@clz(u8, ~bytes[0]) == 4);
     var value: u21 = bytes[0] & 0b00000111;
 
-    if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[1]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[1] & 0b00111111;
 
-    if (bytes[2] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[2]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[2] & 0b00111111;
 
-    if (bytes[3] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
+    if (@clz(u8, ~bytes[3]) != 1) return error.Utf8ShortChar;
     value <<= 6;
     value |= bytes[3] & 0b00111111;
 
     if (value < 0x10000) return error.Utf8OverlongEncoding;
-    if (value > 0x10FFFF) return error.Utf8CodepointTooLarge;
+    if (value > 0x10FFFF) return error.UnicodeCodepointTooLarge;
 
     return value;
 }
 
-pub fn utf8ValidateSlice(s: []const u8) bool {
+// TODO replace with something faster:
+// https://github.com/cyb70289/utf8/
+// https://lemire.me/blog/2018/10/19/validating-utf-8-bytes-using-only-0-45-cycles-per-byte-avx-edition/
+pub fn utf8ValidateSliceWithLoc(s: []const u8, ret_invalid_maybe: ?*usize) !void {
     var i: usize = 0;
     while (i < s.len) {
-        if (utf8ByteSequenceLength(s[i])) |cp_len| {
-            if (i + cp_len > s.len) {
-                return false;
-            }
-
-            if (utf8Decode(s[i .. i + cp_len])) |_| {} else |_| {
-                return false;
+        const c = utf8Decode(s[i..]) catch |err| {
+            if (ret_invalid_maybe) |ret_invalid| {
+                ret_invalid.* = i;
             }
-            i += cp_len;
-        } else |err| {
-            return false;
-        }
+            return err;
+        };
+        i += c.utf8len;
     }
+    return;
+}
+
+pub fn utf8ValidateSlice(s: []const u8) bool {
+    utf8ValidateSliceWithLoc(s, null) catch return false;
     return true;
 }
 
@@ -179,10 +228,7 @@ pub const Utf8View = struct {
     bytes: []const u8,
 
     pub fn init(s: []const u8) !Utf8View {
-        if (!utf8ValidateSlice(s)) {
-            return error.InvalidUtf8;
-        }
-
+        try utf8ValidateSliceWithLoc(s, null);
         return initUnchecked(s);
     }
 
@@ -194,11 +240,9 @@ pub const Utf8View = struct {
     pub fn initComptime(comptime s: []const u8) Utf8View {
         if (comptime init(s)) |r| {
             return r;
-        } else |err| switch (err) {
-            error.InvalidUtf8 => {
-                @compileError("invalid utf8");
-                unreachable;
-            },
+        } else |err| {
+            @compileError("invalid utf8: " ++ @errorName(err)); // name the specific decoding error
+            unreachable;
         }
     }
 
@@ -214,26 +258,24 @@ pub const Utf8Iterator = struct {
     bytes: []const u8,
     i: usize,
 
-    pub fn nextCodepointSlice(it: *Utf8Iterator) ?[]const u8 {
+    pub fn nextCodepointSlice(it: *Utf8Iterator) !?[]const u8 {
         if (it.i >= it.bytes.len) {
             return null;
         }
 
-        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;
+        const cp_len = try utf8ByteSequenceLength(it.bytes[it.i]);
         it.i += cp_len;
         return it.bytes[it.i - cp_len .. it.i];
     }
 
-    pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
-        const slice = it.nextCodepointSlice() orelse return null;
-
-        switch (slice.len) {
-            1 => return @as(u21, slice[0]),
-            2 => return utf8Decode2(slice) catch unreachable,
-            3 => return utf8Decode3(slice) catch unreachable,
-            4 => return utf8Decode4(slice) catch unreachable,
-            else => unreachable,
+    pub fn nextCodepoint(it: *Utf8Iterator) !?u21 {
+        if (it.i >= it.bytes.len) {
+            return null;
         }
+
+        const c = try utf8Decode(it.bytes[it.i..]);
+        it.i += c.utf8len;
+        return c.codepoint;
     }
 };
 
@@ -251,60 +293,33 @@ pub const Utf16LeIterator = struct {
     pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 {
         assert(it.i <= it.bytes.len);
         if (it.i == it.bytes.len) return null;
-        const c0: u21 = mem.readIntLittle(u16, it.bytes[it.i..][0..2]);
-        if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
+        const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
+        if (c0 & ~@as(u32, 0x03ff) == 0xd800) {
             // surrogate pair
             it.i += 2;
             if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
-            const c1: u21 = mem.readIntLittle(u16, it.bytes[it.i..][0..2]);
-            if (c1 & ~@as(u21, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
+            const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
+            if (c1 & ~@as(u32, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
             it.i += 2;
-            return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
-        } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) {
+            return @truncate(u21, 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)));
+        } else if (c0 & ~@as(u32, 0x03ff) == 0xdc00) {
             return error.UnexpectedSecondSurrogateHalf;
         } else {
             it.i += 2;
-            return c0;
+            return @truncate(u21, c0);
         }
     }
 };
 
-test "utf8 encode" {
-    comptime testUtf8Encode() catch unreachable;
-    try testUtf8Encode();
-}
-fn testUtf8Encode() !void {
-    // A few taken from wikipedia a few taken elsewhere
-    var array: [4]u8 = undefined;
-    testing.expect((try utf8Encode(try utf8Decode("€"), array[0..])) == 3);
-    testing.expect(array[0] == 0b11100010);
-    testing.expect(array[1] == 0b10000010);
-    testing.expect(array[2] == 0b10101100);
-
-    testing.expect((try utf8Encode(try utf8Decode("$"), array[0..])) == 1);
-    testing.expect(array[0] == 0b00100100);
-
-    testing.expect((try utf8Encode(try utf8Decode("¢"), array[0..])) == 2);
-    testing.expect(array[0] == 0b11000010);
-    testing.expect(array[1] == 0b10100010);
-
-    testing.expect((try utf8Encode(try utf8Decode("𐍈"), array[0..])) == 4);
-    testing.expect(array[0] == 0b11110000);
-    testing.expect(array[1] == 0b10010000);
-    testing.expect(array[2] == 0b10001101);
-    testing.expect(array[3] == 0b10001000);
-}
-
 test "utf8 encode error" {
     comptime testUtf8EncodeError();
     testUtf8EncodeError();
 }
 fn testUtf8EncodeError() void {
     var array: [4]u8 = undefined;
-    testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
-    testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
-    testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
-    testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge);
+    testErrorEncode(0xd800, array[0..], error.UnicodeSurrogateHalf);
+    testErrorEncode(0xdfff, array[0..], error.UnicodeSurrogateHalf);
+    testErrorEncode(0x110000, array[0..], error.UnicodeCodepointTooLarge);
 }
 
 fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void {
@@ -312,23 +327,23 @@ fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void {
 }
 
 test "utf8 iterator on ascii" {
-    comptime testUtf8IteratorOnAscii();
-    testUtf8IteratorOnAscii();
+    try comptime testUtf8IteratorOnAscii();
+    try testUtf8IteratorOnAscii();
 }
-fn testUtf8IteratorOnAscii() void {
+fn testUtf8IteratorOnAscii() !void {
     const s = Utf8View.initComptime("abc");
 
     var it1 = s.iterator();
-    testing.expect(std.mem.eql(u8, "a", it1.nextCodepointSlice().?));
-    testing.expect(std.mem.eql(u8, "b", it1.nextCodepointSlice().?));
-    testing.expect(std.mem.eql(u8, "c", it1.nextCodepointSlice().?));
-    testing.expect(it1.nextCodepointSlice() == null);
+    testing.expect(std.mem.eql(u8, "a", (try it1.nextCodepointSlice()).?));
+    testing.expect(std.mem.eql(u8, "b", (try it1.nextCodepointSlice()).?));
+    testing.expect(std.mem.eql(u8, "c", (try it1.nextCodepointSlice()).?));
+    testing.expect((try it1.nextCodepointSlice()) == null);
 
     var it2 = s.iterator();
-    testing.expect(it2.nextCodepoint().? == 'a');
-    testing.expect(it2.nextCodepoint().? == 'b');
-    testing.expect(it2.nextCodepoint().? == 'c');
-    testing.expect(it2.nextCodepoint() == null);
+    testing.expect((try it2.nextCodepoint()).? == 'a');
+    testing.expect((try it2.nextCodepoint()).? == 'b');
+    testing.expect((try it2.nextCodepoint()).? == 'c');
+    testing.expect((try it2.nextCodepoint()) == null);
 }
 
 test "utf8 view bad" {
@@ -338,27 +353,27 @@ test "utf8 view bad" {
 fn testUtf8ViewBad() void {
     // Compile-time error.
     // const s3 = Utf8View.initComptime("\xfe\xf2");
-    testing.expectError(error.InvalidUtf8, Utf8View.init("hel\xadlo"));
+    testing.expectError(error.Utf8InvalidStartByte, Utf8View.init("hel\xadlo"));
 }
 
 test "utf8 view ok" {
-    comptime testUtf8ViewOk();
-    testUtf8ViewOk();
+    try comptime testUtf8ViewOk();
+    try testUtf8ViewOk();
 }
-fn testUtf8ViewOk() void {
+fn testUtf8ViewOk() !void {
     const s = Utf8View.initComptime("東京市");
 
     var it1 = s.iterator();
-    testing.expect(std.mem.eql(u8, "東", it1.nextCodepointSlice().?));
-    testing.expect(std.mem.eql(u8, "京", it1.nextCodepointSlice().?));
-    testing.expect(std.mem.eql(u8, "市", it1.nextCodepointSlice().?));
-    testing.expect(it1.nextCodepointSlice() == null);
+    testing.expect(std.mem.eql(u8, "東", (try it1.nextCodepointSlice()).?));
+    testing.expect(std.mem.eql(u8, "京", (try it1.nextCodepointSlice()).?));
+    testing.expect(std.mem.eql(u8, "市", (try it1.nextCodepointSlice()).?));
+    testing.expect((try it1.nextCodepointSlice()) == null);
 
     var it2 = s.iterator();
-    testing.expect(it2.nextCodepoint().? == 0x6771);
-    testing.expect(it2.nextCodepoint().? == 0x4eac);
-    testing.expect(it2.nextCodepoint().? == 0x5e02);
-    testing.expect(it2.nextCodepoint() == null);
+    testing.expect((try it2.nextCodepoint()).? == 0x6771);
+    testing.expect((try it2.nextCodepoint()).? == 0x4eac);
+    testing.expect((try it2.nextCodepoint()).? == 0x5e02);
+    testing.expect((try it2.nextCodepoint()) == null);
 }
 
 test "bad utf8 slice" {
@@ -403,24 +418,24 @@ fn testInvalidUtf8ContinuationBytes() void {
     testError("\xf8", error.Utf8InvalidStartByte);
     testError("\xff", error.Utf8InvalidStartByte);
     // expected continuation for 2 byte sequences
-    testError("\xc2", error.UnexpectedEof);
-    testError("\xc2\x00", error.Utf8ExpectedContinuation);
-    testError("\xc2\xc0", error.Utf8ExpectedContinuation);
+    testError("\xc2", error.Utf8ShortChar);
+    testError("\xc2\x00", error.Utf8ShortChar);
+    testError("\xc2\xc0", error.Utf8ShortChar);
     // expected continuation for 3 byte sequences
-    testError("\xe0", error.UnexpectedEof);
-    testError("\xe0\x00", error.UnexpectedEof);
-    testError("\xe0\xc0", error.UnexpectedEof);
-    testError("\xe0\xa0", error.UnexpectedEof);
-    testError("\xe0\xa0\x00", error.Utf8ExpectedContinuation);
-    testError("\xe0\xa0\xc0", error.Utf8ExpectedContinuation);
+    testError("\xe0", error.Utf8ShortChar);
+    testError("\xe0\x00", error.Utf8ShortChar);
+    testError("\xe0\xc0", error.Utf8ShortChar);
+    testError("\xe0\xa0", error.Utf8ShortChar);
+    testError("\xe0\xa0\x00", error.Utf8ShortChar);
+    testError("\xe0\xa0\xc0", error.Utf8ShortChar);
     // expected continuation for 4 byte sequences
-    testError("\xf0", error.UnexpectedEof);
-    testError("\xf0\x00", error.UnexpectedEof);
-    testError("\xf0\xc0", error.UnexpectedEof);
-    testError("\xf0\x90\x00", error.UnexpectedEof);
-    testError("\xf0\x90\xc0", error.UnexpectedEof);
-    testError("\xf0\x90\x80\x00", error.Utf8ExpectedContinuation);
-    testError("\xf0\x90\x80\xc0", error.Utf8ExpectedContinuation);
+    testError("\xf0", error.Utf8ShortChar);
+    testError("\xf0\x00", error.Utf8ShortChar);
+    testError("\xf0\xc0", error.Utf8ShortChar);
+    testError("\xf0\x90\x00", error.Utf8ShortChar);
+    testError("\xf0\x90\xc0", error.Utf8ShortChar);
+    testError("\xf0\x90\x80\x00", error.Utf8ShortChar);
+    testError("\xf0\x90\x80\xc0", error.Utf8ShortChar);
 }
 
 test "overlong utf8 codepoint" {
@@ -442,12 +457,12 @@ test "misc invalid utf8" {
 }
 fn testMiscInvalidUtf8() void {
     // codepoint out of bounds
-    testError("\xf4\x90\x80\x80", error.Utf8CodepointTooLarge);
-    testError("\xf7\xbf\xbf\xbf", error.Utf8CodepointTooLarge);
+    testError("\xf4\x90\x80\x80", error.UnicodeCodepointTooLarge);
+    testError("\xf7\xbf\xbf\xbf", error.UnicodeCodepointTooLarge);
     // surrogate halves
     testValid("\xed\x9f\xbf", 0xd7ff);
-    testError("\xed\xa0\x80", error.Utf8EncodesSurrogateHalf);
-    testError("\xed\xbf\xbf", error.Utf8EncodesSurrogateHalf);
+    testError("\xed\xa0\x80", error.UnicodeSurrogateHalf);
+    testError("\xed\xbf\xbf", error.UnicodeSurrogateHalf);
     testValid("\xee\x80\x80", 0xe000);
 }
 
@@ -455,15 +470,16 @@ fn testError(bytes: []const u8, expected_err: anyerror) void {
     testing.expectError(expected_err, testDecode(bytes));
 }
 
-fn testValid(bytes: []const u8, expected_codepoint: u21) void {
+fn testValid(bytes: []const u8, expected_codepoint: u32) void {
     testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint);
 }
 
-fn testDecode(bytes: []const u8) !u21 {
+fn testDecode(bytes: []const u8) !u32 {
     const length = try utf8ByteSequenceLength(bytes[0]);
-    if (bytes.len < length) return error.UnexpectedEof;
+    if (bytes.len < length) return error.Utf8ShortChar;
     testing.expect(bytes.len == length);
-    return utf8Decode(bytes);
+    const c = try utf8Decode(bytes);
+    return @as(u32, c.codepoint);
 }
 
 /// Caller must free returned memory.
@@ -557,7 +573,7 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![:0]u
 
     const view = try Utf8View.init(utf8);
     var it = view.iterator();
-    while (it.nextCodepoint()) |codepoint| {
+    while (try it.nextCodepoint()) |codepoint| {
         if (codepoint < 0x10000) {
             const short = @intCast(u16, codepoint);
             try result.append(mem.nativeToLittle(u16, short));
@@ -582,21 +598,19 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
     var dest_i: usize = 0;
     var src_i: usize = 0;
     while (src_i < utf8.len) {
-        const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8;
-        const next_src_i = src_i + n;
-        const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
-        if (codepoint < 0x10000) {
-            const short = @intCast(u16, codepoint);
+        const c = utf8Decode(utf8[src_i..]) catch return error.InvalidUtf8;
+        if (c.codepoint < 0x10000) {
+            const short = @intCast(u16, c.codepoint);
             utf16le[dest_i] = mem.nativeToLittle(u16, short);
             dest_i += 1;
         } else {
-            const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800;
-            const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00;
+            const high = @intCast(u16, (c.codepoint - 0x10000) >> 10) + 0xD800;
+            const low = @intCast(u16, c.codepoint & 0x3FF) + 0xDC00;
             utf16le[dest_i] = mem.nativeToLittle(u16, high);
             utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
             dest_i += 2;
         }
-        src_i = next_src_i;
+        src_i = src_i + c.utf8len;
     }
     return dest_i;
 }
@@ -646,15 +660,13 @@ fn calcUtf16LeLen(utf8: []const u8) usize {
     var src_i: usize = 0;
     var dest_len: usize = 0;
     while (src_i < utf8.len) {
-        const n = utf8ByteSequenceLength(utf8[src_i]) catch unreachable;
-        const next_src_i = src_i + n;
-        const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch unreachable;
-        if (codepoint < 0x10000) {
+        const c = utf8Decode(utf8[src_i..]) catch unreachable;
+        if (c.codepoint < 0x10000) {
             dest_len += 1;
         } else {
             dest_len += 2;
         }
-        src_i = next_src_i;
+        src_i = src_i + c.utf8len;
     }
     return dest_len;
 }