Skip to content

Commit edb5deb

Browse files
committed
std: unicode codepoints are 21 bits
1 parent ab60654 commit edb5deb

File tree

1 file changed

+22
-22
lines changed

1 file changed

+22
-22
lines changed

lib/std/unicode.zig

Lines changed: 22 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ const mem = std.mem;
66

77
/// Returns how many bytes the UTF-8 representation would require
88
/// for the given codepoint.
9-
pub fn utf8CodepointSequenceLength(c: u32) !u3 {
9+
pub fn utf8CodepointSequenceLength(c: u21) !u3 {
1010
if (c < 0x80) return @as(u3, 1);
1111
if (c < 0x800) return @as(u3, 2);
1212
if (c < 0x10000) return @as(u3, 3);
@@ -32,7 +32,7 @@ pub fn utf8ByteSequenceLength(first_byte: u8) !u3 {
3232
/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
3333
/// Errors: if c cannot be encoded in UTF-8.
3434
/// Returns: the number of bytes written to out.
35-
pub fn utf8Encode(c: u32, out: []u8) !u3 {
35+
pub fn utf8Encode(c: u21, out: []u8) !u3 {
3636
const length = try utf8CodepointSequenceLength(c);
3737
assert(out.len >= length);
3838
switch (length) {
@@ -68,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error
6868
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
6969
/// If you already know the length at comptime, you can call one of
7070
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
71-
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u32 {
71+
pub fn utf8Decode(bytes: []const u8) Utf8DecodeError!u21 {
7272
return switch (bytes.len) {
73-
1 => @as(u32, bytes[0]),
73+
1 => @as(u21, bytes[0]),
7474
2 => utf8Decode2(bytes),
7575
3 => utf8Decode3(bytes),
7676
4 => utf8Decode4(bytes),
@@ -82,10 +82,10 @@ const Utf8Decode2Error = error{
8282
Utf8ExpectedContinuation,
8383
Utf8OverlongEncoding,
8484
};
85-
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u32 {
85+
pub fn utf8Decode2(bytes: []const u8) Utf8Decode2Error!u21 {
8686
assert(bytes.len == 2);
8787
assert(bytes[0] & 0b11100000 == 0b11000000);
88-
var value: u32 = bytes[0] & 0b00011111;
88+
var value: u21 = bytes[0] & 0b00011111;
8989

9090
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
9191
value <<= 6;
@@ -101,10 +101,10 @@ const Utf8Decode3Error = error{
101101
Utf8OverlongEncoding,
102102
Utf8EncodesSurrogateHalf,
103103
};
104-
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u32 {
104+
pub fn utf8Decode3(bytes: []const u8) Utf8Decode3Error!u21 {
105105
assert(bytes.len == 3);
106106
assert(bytes[0] & 0b11110000 == 0b11100000);
107-
var value: u32 = bytes[0] & 0b00001111;
107+
var value: u21 = bytes[0] & 0b00001111;
108108

109109
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
110110
value <<= 6;
@@ -125,10 +125,10 @@ const Utf8Decode4Error = error{
125125
Utf8OverlongEncoding,
126126
Utf8CodepointTooLarge,
127127
};
128-
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u32 {
128+
pub fn utf8Decode4(bytes: []const u8) Utf8Decode4Error!u21 {
129129
assert(bytes.len == 4);
130130
assert(bytes[0] & 0b11111000 == 0b11110000);
131-
var value: u32 = bytes[0] & 0b00000111;
131+
var value: u21 = bytes[0] & 0b00000111;
132132

133133
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
134134
value <<= 6;
@@ -224,11 +224,11 @@ pub const Utf8Iterator = struct {
224224
return it.bytes[it.i - cp_len .. it.i];
225225
}
226226

227-
pub fn nextCodepoint(it: *Utf8Iterator) ?u32 {
227+
pub fn nextCodepoint(it: *Utf8Iterator) ?u21 {
228228
const slice = it.nextCodepointSlice() orelse return null;
229229

230230
switch (slice.len) {
231-
1 => return @as(u32, slice[0]),
231+
1 => return @as(u21, slice[0]),
232232
2 => return utf8Decode2(slice) catch unreachable,
233233
3 => return utf8Decode3(slice) catch unreachable,
234234
4 => return utf8Decode4(slice) catch unreachable,
@@ -248,19 +248,19 @@ pub const Utf16LeIterator = struct {
248248
};
249249
}
250250

251-
pub fn nextCodepoint(it: *Utf16LeIterator) !?u32 {
251+
pub fn nextCodepoint(it: *Utf16LeIterator) !?u21 {
252252
assert(it.i <= it.bytes.len);
253253
if (it.i == it.bytes.len) return null;
254-
const c0: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
255-
if (c0 & ~@as(u32, 0x03ff) == 0xd800) {
254+
const c0: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
255+
if (c0 & ~@as(u21, 0x03ff) == 0xd800) {
256256
// surrogate pair
257257
it.i += 2;
258258
if (it.i >= it.bytes.len) return error.DanglingSurrogateHalf;
259-
const c1: u32 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
260-
if (c1 & ~@as(u32, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
259+
const c1: u21 = mem.readIntSliceLittle(u16, it.bytes[it.i .. it.i + 2]);
260+
if (c1 & ~@as(u21, 0x03ff) != 0xdc00) return error.ExpectedSecondSurrogateHalf;
261261
it.i += 2;
262262
return 0x10000 + (((c0 & 0x03ff) << 10) | (c1 & 0x03ff));
263-
} else if (c0 & ~@as(u32, 0x03ff) == 0xdc00) {
263+
} else if (c0 & ~@as(u21, 0x03ff) == 0xdc00) {
264264
return error.UnexpectedSecondSurrogateHalf;
265265
} else {
266266
it.i += 2;
@@ -304,10 +304,10 @@ fn testUtf8EncodeError() void {
304304
testErrorEncode(0xd800, array[0..], error.Utf8CannotEncodeSurrogateHalf);
305305
testErrorEncode(0xdfff, array[0..], error.Utf8CannotEncodeSurrogateHalf);
306306
testErrorEncode(0x110000, array[0..], error.CodepointTooLarge);
307-
testErrorEncode(0xffffffff, array[0..], error.CodepointTooLarge);
307+
testErrorEncode(0x1fffff, array[0..], error.CodepointTooLarge);
308308
}
309309

310-
fn testErrorEncode(codePoint: u32, array: []u8, expectedErr: anyerror) void {
310+
fn testErrorEncode(codePoint: u21, array: []u8, expectedErr: anyerror) void {
311311
testing.expectError(expectedErr, utf8Encode(codePoint, array));
312312
}
313313

@@ -455,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void {
455455
testing.expectError(expected_err, testDecode(bytes));
456456
}
457457

458-
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
458+
fn testValid(bytes: []const u8, expected_codepoint: u21) void {
459459
testing.expect((testDecode(bytes) catch unreachable) == expected_codepoint);
460460
}
461461

462-
fn testDecode(bytes: []const u8) !u32 {
462+
fn testDecode(bytes: []const u8) !u21 {
463463
const length = try utf8ByteSequenceLength(bytes[0]);
464464
if (bytes.len < length) return error.UnexpectedEof;
465465
testing.expect(bytes.len == length);

0 commit comments

Comments (0)