std: simplify utf8ToUtf16Le

daurnimator · daurnimator · commit ecdec177c370 · 2019-12-28T15:07:45.000+11:00
Also faster: on my machine, unicode/throughput_test.zig now gives e.g.
> original utf8ToUtf16Le: elapsed: 1048 ns (0 ms)
> new utf8ToUtf16Le: elapsed: 971 ns (0 ms)
diff --git a/lib/std/os/windows.zig b/lib/std/os/windows.zig
@@ -1019,7 +1019,7 @@ pub fn sliceToPrefixedSuffixedFileW(s: []const u8, comptime suffix: []const u16)
         mem.copy(u16, result[0..], &prefix);
         break :blk prefix.len;
     };
-    const end_index = start_index + try std.unicode.utf8ToUtf16Le(result[start_index..], s);
+    const end_index = start_index + (std.unicode.utf8ToUtf16Le(result[start_index..], s) catch return error.InvalidUtf8);
     if (end_index + suffix.len > result.len) return error.NameTooLong;
     mem.copy(u16, result[end_index..], suffix);
     result[end_index + suffix.len] = 0;
diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig
@@ -576,33 +576,21 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
     var dest_i: usize = 0;
     var src_i: usize = 0;
     while (src_i < utf8.len) {
-        const byte = utf8[src_i];
-        const n = @clz(u8, ~byte);
-        switch (n) {
-            0 => {
-                utf16le[dest_i] = byte;
-                dest_i += 1;
-                src_i += 1;
-                continue;
-            },
-            2, 3, 4 => {
-                const next_src_i = src_i + n;
-                const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
-                if (codepoint < 0x10000) {
-                    const short = @intCast(u16, codepoint);
-                    utf16le[dest_i] = mem.nativeToLittle(u16, short);
-                    dest_i += 1;
-                } else {
-                    const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800;
-                    const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00;
-                    utf16le[dest_i] = mem.nativeToLittle(u16, high);
-                    utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
-                    dest_i += 2;
-                }
-                src_i = next_src_i;
-            },
-            else => return error.InvalidUtf8,
+        const n = try utf8ByteSequenceLength(utf8[src_i]);
+        const next_src_i = src_i + n;
+        const codepoint = try utf8Decode(utf8[src_i..next_src_i]);
+        if (codepoint < 0x10000) {
+            const short = @intCast(u16, codepoint);
+            utf16le[dest_i] = mem.nativeToLittle(u16, short);
+            dest_i += 1;
+        } else {
+            const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800;
+            const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00;
+            utf16le[dest_i] = mem.nativeToLittle(u16, high);
+            utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
+            dest_i += 2;
         }
+        src_i = next_src_i;
     }
     return dest_i;
 }