@@ -6,7 +6,7 @@ const mem = std.mem;
6
6
7
7
/// Returns how many bytes the UTF-8 representation would require
8
8
/// for the given codepoint.
9
- pub fn utf8CodepointSequenceLength (c : u32 ) ! u3 {
9
+ pub fn utf8CodepointSequenceLength (c : u21 ) ! u3 {
10
10
if (c < 0x80 ) return @as (u3 , 1 );
11
11
if (c < 0x800 ) return @as (u3 , 2 );
12
12
if (c < 0x10000 ) return @as (u3 , 3 );
@@ -18,19 +18,21 @@ pub fn utf8CodepointSequenceLength(c: u32) !u3 {
18
18
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
19
19
/// If this byte does not match the form of a UTF-8 start byte, returns Utf8InvalidStartByte.
20
20
pub fn utf8ByteSequenceLength (first_byte : u8 ) ! u3 {
21
- if (first_byte < 0b10000000 ) return @as (u3 , 1 );
22
- if (first_byte & 0b11100000 == 0b11000000 ) return @as (u3 , 2 );
23
- if (first_byte & 0b11110000 == 0b11100000 ) return @as (u3 , 3 );
24
- if (first_byte & 0b11111000 == 0b11110000 ) return @as (u3 , 4 );
25
- return error .Utf8InvalidStartByte ;
21
+ return switch (@clz (u8 , ~ first_byte )) {
22
+ 0 = > 1 ,
23
+ 2 = > 2 ,
24
+ 3 = > 3 ,
25
+ 4 = > 4 ,
26
+ else = > error .Utf8InvalidStartByte ,
27
+ };
26
28
}
27
29
28
30
/// Encodes the given codepoint into a UTF-8 byte sequence.
29
31
/// c: the codepoint.
30
32
/// out: the out buffer to write to. Must have a len >= utf8CodepointSequenceLength(c).
31
33
/// Errors: if c cannot be encoded in UTF-8.
32
34
/// Returns: the number of bytes written to out.
33
- pub fn utf8Encode (c : u32 , out : []u8 ) ! u3 {
35
+ pub fn utf8Encode (c : u21 , out : []u8 ) ! u3 {
34
36
const length = try utf8CodepointSequenceLength (c );
35
37
assert (out .len >= length );
36
38
switch (length ) {
@@ -66,9 +68,9 @@ const Utf8DecodeError = Utf8Decode2Error || Utf8Decode3Error || Utf8Decode4Error
66
68
/// bytes.len must be equal to utf8ByteSequenceLength(bytes[0]) catch unreachable.
67
69
/// If you already know the length at comptime, you can call one of
68
70
/// utf8Decode2,utf8Decode3,utf8Decode4 directly instead of this function.
69
- pub fn utf8Decode (bytes : []const u8 ) Utf8DecodeError ! u32 {
71
+ pub fn utf8Decode (bytes : []const u8 ) Utf8DecodeError ! u21 {
70
72
return switch (bytes .len ) {
71
- 1 = > @as (u32 , bytes [0 ]),
73
+ 1 = > @as (u21 , bytes [0 ]),
72
74
2 = > utf8Decode2 (bytes ),
73
75
3 = > utf8Decode3 (bytes ),
74
76
4 = > utf8Decode4 (bytes ),
@@ -80,10 +82,10 @@ const Utf8Decode2Error = error{
80
82
Utf8ExpectedContinuation ,
81
83
Utf8OverlongEncoding ,
82
84
};
83
- pub fn utf8Decode2 (bytes : []const u8 ) Utf8Decode2Error ! u32 {
85
+ pub fn utf8Decode2 (bytes : []const u8 ) Utf8Decode2Error ! u21 {
84
86
assert (bytes .len == 2 );
85
87
assert (bytes [0 ] & 0b11100000 == 0b11000000 );
86
- var value : u32 = bytes [0 ] & 0b00011111 ;
88
+ var value : u21 = bytes [0 ] & 0b00011111 ;
87
89
88
90
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
89
91
value <<= 6 ;
@@ -99,10 +101,10 @@ const Utf8Decode3Error = error{
99
101
Utf8OverlongEncoding ,
100
102
Utf8EncodesSurrogateHalf ,
101
103
};
102
- pub fn utf8Decode3 (bytes : []const u8 ) Utf8Decode3Error ! u32 {
104
+ pub fn utf8Decode3 (bytes : []const u8 ) Utf8Decode3Error ! u21 {
103
105
assert (bytes .len == 3 );
104
106
assert (bytes [0 ] & 0b11110000 == 0b11100000 );
105
- var value : u32 = bytes [0 ] & 0b00001111 ;
107
+ var value : u21 = bytes [0 ] & 0b00001111 ;
106
108
107
109
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
108
110
value <<= 6 ;
@@ -123,10 +125,10 @@ const Utf8Decode4Error = error{
123
125
Utf8OverlongEncoding ,
124
126
Utf8CodepointTooLarge ,
125
127
};
126
- pub fn utf8Decode4 (bytes : []const u8 ) Utf8Decode4Error ! u32 {
128
+ pub fn utf8Decode4 (bytes : []const u8 ) Utf8Decode4Error ! u21 {
127
129
assert (bytes .len == 4 );
128
130
assert (bytes [0 ] & 0b11111000 == 0b11110000 );
129
- var value : u32 = bytes [0 ] & 0b00000111 ;
131
+ var value : u21 = bytes [0 ] & 0b00000111 ;
130
132
131
133
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
132
134
value <<= 6 ;
@@ -222,11 +224,11 @@ pub const Utf8Iterator = struct {
222
224
return it .bytes [it .i - cp_len .. it .i ];
223
225
}
224
226
225
- pub fn nextCodepoint (it : * Utf8Iterator ) ? u32 {
227
+ pub fn nextCodepoint (it : * Utf8Iterator ) ? u21 {
226
228
const slice = it .nextCodepointSlice () orelse return null ;
227
229
228
230
switch (slice .len ) {
229
- 1 = > return @as (u32 , slice [0 ]),
231
+ 1 = > return @as (u21 , slice [0 ]),
230
232
2 = > return utf8Decode2 (slice ) catch unreachable ,
231
233
3 = > return utf8Decode3 (slice ) catch unreachable ,
232
234
4 = > return utf8Decode4 (slice ) catch unreachable ,
@@ -246,19 +248,19 @@ pub const Utf16LeIterator = struct {
246
248
};
247
249
}
248
250
249
- pub fn nextCodepoint (it : * Utf16LeIterator ) ! ? u32 {
251
+ pub fn nextCodepoint (it : * Utf16LeIterator ) ! ? u21 {
250
252
assert (it .i <= it .bytes .len );
251
253
if (it .i == it .bytes .len ) return null ;
252
- const c0 : u32 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
253
- if (c0 & ~ @as (u32 , 0x03ff ) == 0xd800 ) {
254
+ const c0 : u21 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
255
+ if (c0 & ~ @as (u21 , 0x03ff ) == 0xd800 ) {
254
256
// surrogate pair
255
257
it .i += 2 ;
256
258
if (it .i >= it .bytes .len ) return error .DanglingSurrogateHalf ;
257
- const c1 : u32 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
258
- if (c1 & ~ @as (u32 , 0x03ff ) != 0xdc00 ) return error .ExpectedSecondSurrogateHalf ;
259
+ const c1 : u21 = mem .readIntSliceLittle (u16 , it .bytes [it .i .. it .i + 2 ]);
260
+ if (c1 & ~ @as (u21 , 0x03ff ) != 0xdc00 ) return error .ExpectedSecondSurrogateHalf ;
259
261
it .i += 2 ;
260
262
return 0x10000 + (((c0 & 0x03ff ) << 10 ) | (c1 & 0x03ff ));
261
- } else if (c0 & ~ @as (u32 , 0x03ff ) == 0xdc00 ) {
263
+ } else if (c0 & ~ @as (u21 , 0x03ff ) == 0xdc00 ) {
262
264
return error .UnexpectedSecondSurrogateHalf ;
263
265
} else {
264
266
it .i += 2 ;
@@ -302,10 +304,10 @@ fn testUtf8EncodeError() void {
302
304
testErrorEncode (0xd800 , array [0.. ], error .Utf8CannotEncodeSurrogateHalf );
303
305
testErrorEncode (0xdfff , array [0.. ], error .Utf8CannotEncodeSurrogateHalf );
304
306
testErrorEncode (0x110000 , array [0.. ], error .CodepointTooLarge );
305
- testErrorEncode (0xffffffff , array [0.. ], error .CodepointTooLarge );
307
+ testErrorEncode (0x1fffff , array [0.. ], error .CodepointTooLarge );
306
308
}
307
309
308
- fn testErrorEncode (codePoint : u32 , array : []u8 , expectedErr : anyerror ) void {
310
+ fn testErrorEncode (codePoint : u21 , array : []u8 , expectedErr : anyerror ) void {
309
311
testing .expectError (expectedErr , utf8Encode (codePoint , array ));
310
312
}
311
313
@@ -453,11 +455,11 @@ fn testError(bytes: []const u8, expected_err: anyerror) void {
453
455
testing .expectError (expected_err , testDecode (bytes ));
454
456
}
455
457
456
- fn testValid (bytes : []const u8 , expected_codepoint : u32 ) void {
458
+ fn testValid (bytes : []const u8 , expected_codepoint : u21 ) void {
457
459
testing .expect ((testDecode (bytes ) catch unreachable ) == expected_codepoint );
458
460
}
459
461
460
- fn testDecode (bytes : []const u8 ) ! u32 {
462
+ fn testDecode (bytes : []const u8 ) ! u21 {
461
463
const length = try utf8ByteSequenceLength (bytes [0 ]);
462
464
if (bytes .len < length ) return error .UnexpectedEof ;
463
465
testing .expect (bytes .len == length );
@@ -555,9 +557,8 @@ pub fn utf8ToUtf16LeWithNull(allocator: *mem.Allocator, utf8: []const u8) ![]u16
555
557
const short = @intCast (u16 , codepoint );
556
558
try result .append (mem .nativeToLittle (u16 , short ));
557
559
} else {
558
- const short = @intCast (u16 , codepoint - 0x10000 );
559
- const high = (short >> 10 ) + 0xD800 ;
560
- const low = (short & 0x3FF ) + 0xDC00 ;
560
+ const high = @intCast (u16 , (codepoint - 0x10000 ) >> 10 ) + 0xD800 ;
561
+ const low = @intCast (u16 , codepoint & 0x3FF ) + 0xDC00 ;
561
562
var out : [2 ]u16 = undefined ;
562
563
out [0 ] = mem .nativeToLittle (u16 , high );
563
564
out [1 ] = mem .nativeToLittle (u16 , low );
@@ -575,48 +576,50 @@ pub fn utf8ToUtf16Le(utf16le: []u16, utf8: []const u8) !usize {
575
576
var dest_i : usize = 0 ;
576
577
var src_i : usize = 0 ;
577
578
while (src_i < utf8 .len ) {
578
- const byte = utf8 [src_i ];
579
- const n = @clz (u8 , ~ byte );
580
- switch (n ) {
581
- 0 = > {
582
- utf16le [dest_i ] = byte ;
583
- dest_i += 1 ;
584
- src_i += 1 ;
585
- continue ;
586
- },
587
- 2 , 3 , 4 = > {
588
- const next_src_i = src_i + n ;
589
- const codepoint = utf8Decode (utf8 [src_i .. next_src_i ]) catch return error .InvalidUtf8 ;
590
- if (codepoint < 0x10000 ) {
591
- const short = @intCast (u16 , codepoint );
592
- utf16le [dest_i ] = mem .nativeToLittle (u16 , short );
593
- dest_i += 1 ;
594
- } else {
595
- const short = @intCast (u16 , codepoint - 0x10000 );
596
- const high = (short >> 10 ) + 0xD800 ;
597
- const low = (short & 0x3FF ) + 0xDC00 ;
598
- utf16le [dest_i ] = mem .nativeToLittle (u16 , high );
599
- utf16le [dest_i + 1 ] = mem .nativeToLittle (u16 , low );
600
- dest_i += 2 ;
601
- }
602
- src_i = next_src_i ;
603
- },
604
- else = > return error .InvalidUtf8 ,
579
+ const n = utf8ByteSequenceLength (utf8 [src_i ]) catch return error .InvalidUtf8 ;
580
+ const next_src_i = src_i + n ;
581
+ const codepoint = utf8Decode (utf8 [src_i .. next_src_i ]) catch return error .InvalidUtf8 ;
582
+ if (codepoint < 0x10000 ) {
583
+ const short = @intCast (u16 , codepoint );
584
+ utf16le [dest_i ] = mem .nativeToLittle (u16 , short );
585
+ dest_i += 1 ;
586
+ } else {
587
+ const high = @intCast (u16 , (codepoint - 0x10000 ) >> 10 ) + 0xD800 ;
588
+ const low = @intCast (u16 , codepoint & 0x3FF ) + 0xDC00 ;
589
+ utf16le [dest_i ] = mem .nativeToLittle (u16 , high );
590
+ utf16le [dest_i + 1 ] = mem .nativeToLittle (u16 , low );
591
+ dest_i += 2 ;
605
592
}
593
+ src_i = next_src_i ;
606
594
}
607
595
return dest_i ;
608
596
}
609
597
610
598
test "utf8ToUtf16Le" {
611
599
var utf16le : [2 ]u16 = [_ ]u16 {0 } ** 2 ;
612
- const length = try utf8ToUtf16Le (utf16le [0.. ], "𐐷" );
613
- testing .expect (@as (usize , 2 ) == length );
614
- testing .expectEqualSlices (u8 , "\x01\xd8\x37\xdc " , @sliceToBytes (utf16le [0.. ]));
600
+ {
601
+ const length = try utf8ToUtf16Le (utf16le [0.. ], "𐐷" );
602
+ testing .expectEqual (@as (usize , 2 ), length );
603
+ testing .expectEqualSlices (u8 , "\x01\xd8\x37\xdc " , @sliceToBytes (utf16le [0.. ]));
604
+ }
605
+ {
606
+ const length = try utf8ToUtf16Le (utf16le [0.. ], "\u{10FFFF} " );
607
+ testing .expectEqual (@as (usize , 2 ), length );
608
+ testing .expectEqualSlices (u8 , "\xff\xdb\xff\xdf " , @sliceToBytes (utf16le [0.. ]));
609
+ }
615
610
}
616
611
617
612
test "utf8ToUtf16LeWithNull" {
618
- var bytes : [128 ]u8 = undefined ;
619
- const allocator = & std .heap .FixedBufferAllocator .init (bytes [0.. ]).allocator ;
620
- const utf16 = try utf8ToUtf16LeWithNull (allocator , "𐐷" );
621
- testing .expectEqualSlices (u8 , "\x01\xd8\x37\xdc\x00\x00 " , @sliceToBytes (utf16 [0.. ]));
613
+ {
614
+ var bytes : [128 ]u8 = undefined ;
615
+ const allocator = & std .heap .FixedBufferAllocator .init (bytes [0.. ]).allocator ;
616
+ const utf16 = try utf8ToUtf16LeWithNull (allocator , "𐐷" );
617
+ testing .expectEqualSlices (u8 , "\x01\xd8\x37\xdc\x00\x00 " , @sliceToBytes (utf16 [0.. ]));
618
+ }
619
+ {
620
+ var bytes : [128 ]u8 = undefined ;
621
+ const allocator = & std .heap .FixedBufferAllocator .init (bytes [0.. ]).allocator ;
622
+ const utf16 = try utf8ToUtf16LeWithNull (allocator , "\u{10FFFF} " );
623
+ testing .expectEqualSlices (u8 , "\xff\xdb\xff\xdf\x00\x00 " , @sliceToBytes (utf16 [0.. ]));
624
+ }
622
625
}
0 commit comments