Skip to content

Commit cb02125

Browse files
authored
Merge pull request #3987 from daurnimator/std.unicode-fixes
std.unicode fixes
2 parents 48bf00b + edb5deb commit cb02125

File tree

2 files changed

+74
-66
lines changed

2 files changed

+74
-66
lines changed

lib/std/unicode.zig

Lines changed: 67 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ const mem = std.mem;
66

77
/// Returns how many bytes the UTF-8 representation would require
88
/// for the given codepoint.
9-
pub fn utf8CodepointSequenceLength(c: u32) !u3 {
9+
pub fn utf8CodepointSequenceLength(c: u21) !u3 {
1010
if (c < 0x80) return @as(u3, 1);
1111
if (c < 0x800) return @as(u3, 2);
1212
if (c < 0x10000) return @as(u3, 3);
@@ -18,19 +18,21 @@ pub fn utf8CodepointSequenceLength(c: u32) !u3 {
1818
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
1919
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
2020
pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
21-
if (first_byte < 0b10000000) return @as(u3, 1);
22-
if (first_byte & 0b11100000 == 0b11000000) return @as(u3, 2);
23-
if (first_byte & 0b11110000 == 0b11100000) return @as(u3, 3);
24-
if (first_byte & 0b11111000 == 0b11110000) return @as(u3, 4);
25-
return error.Utf8InvalidStartByte;
21+
return switch (@clz(u8, ~first_byte)) {
22+
0 => 1,
23+
2 => 2,
24+
3 => 3,
25+
4 => 4,
26+
else => error.Utf8InvalidStartByte,
27+
};
2628
}
2729

2830
/// Encodes the given codepoint into a UTF-8 byte sequence.
2931
/// c: the codepoint.
3032
/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
3133
/// Errors: if c cannot be encoded in UTF-8.
3234
/// Returns: the number of bytes written to out.
33-
pub fn utf8Encode(c: u32, out: []u8) !u3 {
35+
pub fn utf8Encode(c: u21, out: []u8) !u3 {
3436
const length = try utf8CodepointSequenceLength(c);
3537
assert(out.len >= length);
3638
switch (length) {
@@ -66,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error
6668
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
6769
/// If you already know the length at comptime, you can call one of
6870
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
69-
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
71+
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
7072
return switch (bytes.len) {
71-
1 => @as(u32, bytes[0]),
73+
1 => @as(u21, bytes[0]),
7274
2 => utf8Decode2(bytes),
7375
3 => utf8Decode3(bytes),
7476
4 => utf8Decode4(bytes),
@@ -80,10 +82,10 @@ const Utf8Decode2Error = error{
8082
Utf8ExpectedContinuation,
8183
Utf8OverlongEncoding,
8284
};
83-
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
85+
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
8486
assert(bytes.len == 2);
8587
assert(bytes[0] & 0b11100000 == 0b11000000);
86-
var value: u32 = bytes[0] & 0b00011111;
88+
var value: u21 = bytes[0] & 0b00011111;
8789

8890
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
8991
value <<= 6;
@@ -99,10 +101,10 @@ const Utf8Decode3Error = error{
99101
Utf8OverlongEncoding,
100102
Utf8EncodesSurrogateHalf,
101103
};
102-
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
104+
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
103105
assert(bytes.len == 3);
104106
assert(bytes[0] & 0b11110000 == 0b11100000);
105-
var value: u32 = bytes[0] & 0b00001111;
107+
var value: u21 = bytes[0] & 0b00001111;
106108

107109
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
108110
value <<= 6;
@@ -123,10 +125,10 @@ const Utf8Decode4Error = error{
123125
Utf8OverlongEncoding,
124126
Utf8CodepointTooLarge,
125127
};
126-
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
128+
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
127129
assert(bytes.len == 4);
128130
assert(bytes[0] & 0b11111000 == 0b11110000);
129-
var value: u32 = bytes[0] & 0b00000111;
131+
var value: u21 = bytes[0] & 0b00000111;
130132

131133
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
132134
value <<= 6;
@@ -222,11 +224,11 @@ pub const Utf8Iterator = struct {
222224
return it.bytes[it.i - cp_len .. it.i];
223225
}
224226

225-
pub fn nextCodepoint(it: *Utf8Iterator) ?u32 {
227+
pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
226228
const slice = it.nextCodepointSlice() orelse return null;
227229

228230
switch (slice.len) {
229-
1 => return @as(u32, slice[0]),
231+
1 => return @as(u21, slice[0]),
230232
2 => return utf8Decode2(slice) catch unreachable,
231233
3 => return utf8Decode3(slice) catch unreachable,
232234
4 => return utf8Decode4(slice) catch unreachable,
@@ -246,19 +248,19 @@ pub const Utf16LeIterator = struct {
246248
};
247249
}
248250

249-
pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 {
251+
pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 {
250252
assert(it.i <= it.bytes.len);
251253
if (it.i == it.bytes.len) return null;
252-
const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
253-
if (c0 & ~@as(u32, 0x03ff) == 0xd800) {
254+
const c0: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
255+
if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
254256
// surrogate pair
255257
it.i += 2;
256258
if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
257-
const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
258-
if (c1 & ~@as(u32, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
259+
const c1: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
260+
if (c1 & ~@as(u21, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
259261
it.i += 2;
260262
return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
261-
} else if (c0 & ~@as(u32, 0x03ff) == 0xdc00) {
263+
} else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) {
262264
return error.UnexpectedSecondSurrogateHalf;
263265
} else {
264266
it.i += 2;
@@ -302,10 +304,10 @@ fn testUtf8EncodeError() void {
302304
testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
303305
testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
304306
testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
305-
testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge);
307+
testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge);
306308
}
307309

308-
fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void {
310+
fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void {
309311
testing.expectError(expectedErr, utf8Encode(codePoint, array));
310312
}
311313

@@ -453,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void {
453455
testing.expectError(expected_err, testDecode(bytes));
454456
}
455457

456-
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
458+
fn testValid(bytes: []const u8, expected_codepoint: u21) void {
457459
testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint);
458460
}
459461

460-
fn testDecode(bytes: []const u8) !u32 {
462+
fn testDecode(bytes: []const u8) !u21 {
461463
const length = try utf8ByteSequenceLength(bytes[0]);
462464
if (bytes.len < length) return error.UnexpectedEof;
463465
testing.expect(bytes.len == length);
@@ -555,9 +557,8 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16
555557
const short = @intCast(u16, codepoint);
556558
try result.append(mem.nativeToLittle(u16, short));
557559
} else {
558-
const short = @intCast(u16, codepoint - 0x10000);
559-
const high = (short >> 10) + 0xD800;
560-
const low = (short & 0x3FF) + 0xDC00;
560+
const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800;
561+
const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00;
561562
var out: [2]u16 = undefined;
562563
out[0] = mem.nativeToLittle(u16, high);
563564
out[1] = mem.nativeToLittle(u16, low);
@@ -575,48 +576,50 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
575576
var dest_i: usize = 0;
576577
var src_i: usize = 0;
577578
while (src_i < utf8.len) {
578-
const byte = utf8[src_i];
579-
const n = @clz(u8, ~byte);
580-
switch (n) {
581-
0 => {
582-
utf16le[dest_i] = byte;
583-
dest_i += 1;
584-
src_i += 1;
585-
continue;
586-
},
587-
2, 3, 4 => {
588-
const next_src_i = src_i + n;
589-
const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
590-
if (codepoint < 0x10000) {
591-
const short = @intCast(u16, codepoint);
592-
utf16le[dest_i] = mem.nativeToLittle(u16, short);
593-
dest_i += 1;
594-
} else {
595-
const short = @intCast(u16, codepoint - 0x10000);
596-
const high = (short >> 10) + 0xD800;
597-
const low = (short & 0x3FF) + 0xDC00;
598-
utf16le[dest_i] = mem.nativeToLittle(u16, high);
599-
utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
600-
dest_i += 2;
601-
}
602-
src_i = next_src_i;
603-
},
604-
else => return error.InvalidUtf8,
579+
const n = utf8ByteSequenceLength(utf8[src_i]) catch return error.InvalidUtf8;
580+
const next_src_i = src_i + n;
581+
const codepoint = utf8Decode(utf8[src_i..next_src_i]) catch return error.InvalidUtf8;
582+
if (codepoint < 0x10000) {
583+
const short = @intCast(u16, codepoint);
584+
utf16le[dest_i] = mem.nativeToLittle(u16, short);
585+
dest_i += 1;
586+
} else {
587+
const high = @intCast(u16, (codepoint - 0x10000) >> 10) + 0xD800;
588+
const low = @intCast(u16, codepoint & 0x3FF) + 0xDC00;
589+
utf16le[dest_i] = mem.nativeToLittle(u16, high);
590+
utf16le[dest_i + 1] = mem.nativeToLittle(u16, low);
591+
dest_i += 2;
605592
}
593+
src_i = next_src_i;
606594
}
607595
return dest_i;
608596
}
609597

610598
test "utf8ToUtf16Le" {
611599
var utf16le: [2]u16 = [_]u16{0} ** 2;
612-
const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
613-
testing.expect(@as(usize, 2) == length);
614-
testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", @sliceToBytes(utf16le[0..]));
600+
{
601+
const length = try utf8ToUtf16Le(utf16le[0..], "𐐷");
602+
testing.expectEqual(@as(usize, 2), length);
603+
testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc", @sliceToBytes(utf16le[0..]));
604+
}
605+
{
606+
const length = try utf8ToUtf16Le(utf16le[0..], "\u{10FFFF}");
607+
testing.expectEqual(@as(usize, 2), length);
608+
testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf", @sliceToBytes(utf16le[0..]));
609+
}
615610
}
616611

617612
test "utf8ToUtf16LeWithNull" {
618-
var bytes: [128]u8 = undefined;
619-
const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator;
620-
const utf16 = try utf8ToUtf16LeWithNull(allocator, "𐐷");
621-
testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc\x00\x00", @sliceToBytes(utf16[0..]));
613+
{
614+
var bytes: [128]u8 = undefined;
615+
const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator;
616+
const utf16 = try utf8ToUtf16LeWithNull(allocator, "𐐷");
617+
testing.expectEqualSlices(u8, "\x01\xd8\x37\xdc\x00\x00", @sliceToBytes(utf16[0..]));
618+
}
619+
{
620+
var bytes: [128]u8 = undefined;
621+
const allocator = &std.heap.FixedBufferAllocator.init(bytes[0..]).allocator;
622+
const utf16 = try utf8ToUtf16LeWithNull(allocator, "\u{10FFFF}");
623+
testing.expectEqualSlices(u8, "\xff\xdb\xff\xdf\x00\x00", @sliceToBytes(utf16[0..]));
624+
}
622625
}

lib/std/unicode/throughput_test.zig

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,23 @@ pub fn main() !void {
66

77
const args = try std.process.argsAlloc(std.heap.page_allocator);
88

9+
// Warm up runs
10+
var buffer0: [32767]u16 align(4096) = undefined;
11+
_ = try std.unicode.utf8ToUtf16Le(&buffer0, args[1]);
12+
_ = try std.unicode.utf8ToUtf16Le_better(&buffer0, args[1]);
13+
914
@fence(.SeqCst);
1015
var timer = try std.time.Timer.start();
1116
@fence(.SeqCst);
1217

13-
var buffer1: [32767]u16 = undefined;
18+
var buffer1: [32767]u16 align(4096) = undefined;
1419
_ = try std.unicode.utf8ToUtf16Le(&buffer1, args[1]);
1520

1621
@fence(.SeqCst);
1722
const elapsed_ns_orig = timer.lap();
1823
@fence(.SeqCst);
1924

20-
var buffer2: [32767]u16 = undefined;
25+
var buffer2: [32767] u16 align(4096) = undefined;
2126
_ = try std.unicode.utf8ToUtf16Le_better(&buffer2, args[1]);
2227

2328
@fence(.SeqCst);

0 commit comments

Comments
 (0)