From a81c0ba2e730bd69e7c7a672fa795c46ce6b001b Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 13:54:42 +1100 Subject: [PATCH 1/5] std: fix unicode encoding of astral plane codepoints to utf16 --- lib/std/unicode.zig | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index ece45691bea1..c2639f915804 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -555,9 +555,8 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16 const short = @intCast(u16, codepoint); try result.append(mem.nativeToLittle(u16, short)); } else { - const short = @intCast(u16, codepoint - 0x10000); - const high = (short >> 10) + 0xD800; - const low = (short & 0x3FF) + 0xDC00; + const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; + const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; var out: [2]u16 = undefined; out[0] = mem.nativeToLittle(u16, high); out[1] = mem.nativeToLittle(u16, low); @@ -592,9 +591,8 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { utf16le[dest_i] = mem.nativeToLittle(u16, short); dest_i += 1; } else { - const short = @intCast(u16, codepoint - 0x10000); - const high = (short >> 10) + 0xD800; - const low = (short & 0x3FF) + 0xDC00; + const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; + const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; utf16le[dest_i] = mem.nativeToLittle(u16, high); utf16le[dest_i + 1] = mem.nativeToLittle(u16, low); dest_i += 2; @@ -609,14 +607,29 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { test "utf8ToUtf16Le" { var utf16le: [2]u16 = [_]u16{0} ** 2; - const length = try utf8ToUtf16Le(utf16le[0..], "𐐷"); - testing.expect(@as(usize, 2) == length); - testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", @sliceToBytes(utf16le[0..])); + { + const length = try utf8ToUtf16Le(utf16le[0..], "𐐷"); + testing.expectEqual(@as(usize, 2), length); + testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", @sliceToBytes(utf16le[0..])); + } + { + const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}"); + testing.expectEqual(@as(usize, 2), length); + testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", @sliceToBytes(utf16le[0..])); + } } test "utf8ToUtf16LeWithNull" { - var bytes: [128]u8 = undefined; - const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator; - const utf16 = try utf8ToUtf16LeWithNull(allocator, "𐐷"); - testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc\x00\x00", @sliceToBytes(utf16[0..])); + { + var bytes: [128]u8 = undefined; + const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator; + const utf16 = try utf8ToUtf16LeWithNull(allocator, "𐐷"); + testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc\x00\x00", @sliceToBytes(utf16[0..])); + } + { + var bytes: [128]u8 = undefined; + const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator; + const utf16 = try utf8ToUtf16LeWithNull(allocator, "\u{10FFFF}"); + testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf\x00\x00", @sliceToBytes(utf16[0..])); + } } From 8b72eedc76f46b978843344f92605acb4ee4061a Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 14:23:27 +1100 Subject: [PATCH 2/5] std: add warm up phase to benchmark In my tests, whatever ran first was getting much better numbers. Additionally, add alignment requirements so that comparison is fair. --- lib/std/unicode/throughput_test.zig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/std/unicode/throughput_test.zig b/lib/std/unicode/throughput_test.zig index a9a211d24ad5..922c25ebc6fc 100644 --- a/lib/std/unicode/throughput_test.zig +++ b/lib/std/unicode/throughput_test.zig @@ -6,18 +6,23 @@ pub fn main() !void { const args = try std.process.argsAlloc(std.heap.page_allocator); + // Warm up runs + var buffer0: [32767]u16 align(4096) = undefined; + _ = try std.unicode.utf8ToUtf16Le(&buffer0, args[1]); + _ = try std.unicode.utf8ToUtf16Le_better(&buffer0, args[1]); + @fence(.SeqCst); var timer = try std.time.Timer.start(); @fence(.SeqCst); - var buffer1: [32767]u16 = undefined; + var buffer1: [32767]u16 align(4096) = undefined; _ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]); @fence(.SeqCst); const elapsed_ns_orig = timer.lap(); @fence(.SeqCst); - var buffer2: [32767]u16 = undefined; + var buffer2: [32767] u16 align(4096) = undefined; _ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]); @fence(.SeqCst); From 5843a6e3bc1a6353f76ebca57d6099337b90139a Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 14:34:00 +1100 Subject: [PATCH 3/5] std: optimise utf8ByteSequenceLength Also tested (but not as fast): ```zig pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { const len = @clz(u8, ~first_byte); if (len == 0) return 1; if (len < 4) return @intCast(u3, len); return error.Utf8InvalidStartByte; } ``` --- lib/std/unicode.zig | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index c2639f915804..88b6b0bd2c65 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -18,11 +18,13 @@ pub fn utf8CodepointSequenceLength(c: u32) !u3 { /// returns a number 1-4 indicating the total length of the codepoint in bytes. /// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte. pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { - if (first_byte < 0b10000000) return @as(u3, 1); - if (first_byte & 0b11100000 == 0b11000000) return @as(u3, 2); - if (first_byte & 0b11110000 == 0b11100000) return @as(u3, 3); - if (first_byte & 0b11111000 == 0b11110000) return @as(u3, 4); - return error.Utf8InvalidStartByte; + return switch (@clz(u8, ~first_byte)) { + 0 => 1, + 2 => 2, + 3 => 3, + 4 => 4, + else => error.Utf8InvalidStartByte, + }; } /// Encodes the given codepoint into a UTF-8 byte sequence. From ab6065407d74fc8d63d398c60f9fe653374d9d6d Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 14:35:53 +1100 Subject: [PATCH 4/5] std: simplify utf8ToUtf16Le Also faster, on my machine unicode/throughput_test.zig now gives e.g. > original utf8ToUtf16Le: elapsed: 1048 ns (0 ms) > new utf8ToUtf16Le: elapsed: 971 ns (0 ms) --- lib/std/unicode.zig | 40 ++++++++++++++-------------------------- 1 file changed, 14 insertions(+), 26 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 88b6b0bd2c65..52fd5889e40e 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -576,33 +576,21 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize { var dest_i: usize = 0; var src_i: usize = 0; while (src_i < utf8.len) { - const byte = utf8[src_i]; - const n = @clz(u8, ~byte); - switch (n) { - 0 => { - utf16le[dest_i] = byte; - dest_i += 1; - src_i += 1; - continue; - }, - 2, 3, 4 => { - const next_src_i = src_i + n; - const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8; - if (codepoint < 0x10000) { - const short = @intCast(u16, codepoint); - utf16le[dest_i] = mem.nativeToLittle(u16, short); - dest_i += 1; - } else { - const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; - const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; - utf16le[dest_i] = mem.nativeToLittle(u16, high); - utf16le[dest_i + 1] = mem.nativeToLittle(u16, low); - dest_i += 2; - } - src_i = next_src_i; - }, - else => return error.InvalidUtf8, + const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8; + const next_src_i = src_i + n; + const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8; + if (codepoint < 0x10000) { + const short = @intCast(u16, codepoint); + utf16le[dest_i] = mem.nativeToLittle(u16, short); + dest_i += 1; + } else { + const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800; + const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00; + utf16le[dest_i] = mem.nativeToLittle(u16, high); + utf16le[dest_i + 1] = mem.nativeToLittle(u16, low); + dest_i += 2; } + src_i = next_src_i; } return dest_i; } From edb5deb39cc923d6fe2d3de507757d8ba9d74d68 Mon Sep 17 00:00:00 2001 From: daurnimator Date: Sat, 28 Dec 2019 14:45:52 +1100 Subject: [PATCH 5/5] std: unicode codepoints are 21 bits --- lib/std/unicode.zig | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/lib/std/unicode.zig b/lib/std/unicode.zig index 52fd5889e40e..4dfe1c87d36f 100644 --- a/lib/std/unicode.zig +++ b/lib/std/unicode.zig @@ -6,7 +6,7 @@ const mem = std.mem; /// Returns how many bytes the UTF-8 representation would require /// for the given codepoint. -pub fn utf8CodepointSequenceLength(c: u32) !u3 { +pub fn utf8CodepointSequenceLength(c: u21) !u3 { if (c < 0x80) return @as(u3, 1); if (c < 0x800) return @as(u3, 2); if (c < 0x10000) return @as(u3, 3); @@ -32,7 +32,7 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 { /// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c). /// Errors: if c cannot be encoded in UTF-8. /// Returns: the number of bytes written to out. -pub fn utf8Encode(c: u32, out: []u8) !u3 { +pub fn utf8Encode(c: u21, out: []u8) !u3 { const length = try utf8CodepointSequenceLength(c); assert(out.len >= length); switch (length) { @@ -68,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error /// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable. /// If you already know the length at comptime, you can call one of /// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function. -pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 { +pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 { return switch (bytes.len) { - 1 => @as(u32, bytes[0]), + 1 => @as(u21, bytes[0]), 2 => utf8Decode2(bytes), 3 => utf8Decode3(bytes), 4 => utf8Decode4(bytes), @@ -82,10 +82,10 @@ const Utf8Decode2Error = error{ Utf8ExpectedContinuation, Utf8OverlongEncoding, }; -pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 { +pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 { assert(bytes.len == 2); assert(bytes[0] & 0b11100000 == 0b11000000); - var value: u32 = bytes[0] & 0b00011111; + var value: u21 = bytes[0] & 0b00011111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -101,10 +101,10 @@ const Utf8Decode3Error = error{ Utf8OverlongEncoding, Utf8EncodesSurrogateHalf, }; -pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 { +pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 { assert(bytes.len == 3); assert(bytes[0] & 0b11110000 == 0b11100000); - var value: u32 = bytes[0] & 0b00001111; + var value: u21 = bytes[0] & 0b00001111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -125,10 +125,10 @@ const Utf8Decode4Error = error{ Utf8OverlongEncoding, Utf8CodepointTooLarge, }; -pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 { +pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 { assert(bytes.len == 4); assert(bytes[0] & 0b11111000 == 0b11110000); - var value: u32 = bytes[0] & 0b00000111; + var value: u21 = bytes[0] & 0b00000111; if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation; value <<= 6; @@ -224,11 +224,11 @@ pub const Utf8Iterator = struct { return it.bytes[it.i - cp_len .. it.i]; } - pub fn nextCodepoint(it: *Utf8Iterator) ?u32 { + pub fn nextCodepoint(it: *Utf8Iterator) ?u21 { const slice = it.nextCodepointSlice() orelse return null; switch (slice.len) { - 1 => return @as(u32, slice[0]), + 1 => return @as(u21, slice[0]), 2 => return utf8Decode2(slice) catch unreachable, 3 => return utf8Decode3(slice) catch unreachable, 4 => return utf8Decode4(slice) catch unreachable, @@ -248,19 +248,19 @@ pub const Utf16LeIterator = struct { }; } - pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 { + pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 { assert(it.i <= it.bytes.len); if (it.i == it.bytes.len) return null; - const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); - if (c0 & ~@as(u32, 0x03ff) == 0xd800) { + const c0: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); + if (c0 & ~@as(u21, 0x03ff) == 0xd800) { // surrogate pair it.i += 2; if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf; - const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); - if (c1 & ~@as(u32, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; + const c1: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]); + if (c1 & ~@as(u21, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf; it.i += 2; return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff)); - } else if (c0 & ~@as(u32, 0x03ff) == 0xdc00) { + } else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) { return error.UnexpectedSecondSurrogateHalf; } else { it.i += 2; @@ -304,10 +304,10 @@ fn testUtf8EncodeError() void { testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf); testErrorEncode(0x110000, array[0..], error.CodepointTooLarge); - testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge); + testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge); } -fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void { +fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void { testing.expectError(expectedErr, utf8Encode(codePoint, array)); } @@ -455,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void { testing.expectError(expected_err, testDecode(bytes)); } -fn testValid(bytes: []const u8, expected_codepoint: u32) void { +fn testValid(bytes: []const u8, expected_codepoint: u21) void { testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint); } -fn testDecode(bytes: []const u8) !u32 { +fn testDecode(bytes: []const u8) !u21 { const length = try utf8ByteSequenceLength(bytes[0]); if (bytes.len < length) return error.UnexpectedEof; testing.expect(bytes.len == length);