1
1
const std = @import ("./index.zig" );
2
+ const debug = std .debug ;
2
3
3
4
/// Given the first byte of a UTF-8 codepoint,
4
5
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
@@ -25,8 +26,8 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
25
26
};
26
27
}
27
28
pub fn utf8Decode2 (bytes : []const u8 ) ! u32 {
28
- std . debug .assert (bytes .len == 2 );
29
- std . debug .assert (bytes [0 ] & 0b11100000 == 0b11000000 );
29
+ debug .assert (bytes .len == 2 );
30
+ debug .assert (bytes [0 ] & 0b11100000 == 0b11000000 );
30
31
var value : u32 = bytes [0 ] & 0b00011111 ;
31
32
32
33
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
@@ -38,8 +39,8 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
38
39
return value ;
39
40
}
40
41
pub fn utf8Decode3 (bytes : []const u8 ) ! u32 {
41
- std . debug .assert (bytes .len == 3 );
42
- std . debug .assert (bytes [0 ] & 0b11110000 == 0b11100000 );
42
+ debug .assert (bytes .len == 3 );
43
+ debug .assert (bytes [0 ] & 0b11110000 == 0b11100000 );
43
44
var value : u32 = bytes [0 ] & 0b00001111 ;
44
45
45
46
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
@@ -56,8 +57,8 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
56
57
return value ;
57
58
}
58
59
pub fn utf8Decode4 (bytes : []const u8 ) ! u32 {
59
- std . debug .assert (bytes .len == 4 );
60
- std . debug .assert (bytes [0 ] & 0b11111000 == 0b11110000 );
60
+ debug .assert (bytes .len == 4 );
61
+ debug .assert (bytes [0 ] & 0b11111000 == 0b11110000 );
61
62
var value : u32 = bytes [0 ] & 0b00000111 ;
62
63
63
64
if (bytes [1 ] & 0b11000000 != 0b10000000 ) return error .Utf8ExpectedContinuation ;
@@ -78,6 +79,136 @@ pub fn utf8Decode4(bytes: []const u8) !u32 {
78
79
return value ;
79
80
}
80
81
82
/// Reports whether every byte of `s` belongs to a sequence that
/// `utf8ByteSequenceLength` and `utf8Decode` both accept.
/// An empty slice is reported as valid.
pub fn utf8ValidateSlice(s: []const u8) bool {
    var pos: usize = 0;
    while (pos < s.len) {
        // A byte that cannot start a sequence invalidates the whole slice.
        const seq_len = utf8ByteSequenceLength(s[pos]) catch return false;

        // The announced sequence has to fit in what is left of the slice.
        const end = pos + seq_len;
        if (end > s.len) return false;

        // Only the success/failure of the decode matters, not the codepoint.
        if (utf8Decode(s[pos..end])) |_| {
            pos = end;
        } else |_| {
            return false;
        }
    }
    return true;
}
98
+
99
/// A read-only view over a byte slice that is meant to hold UTF-8 text.
/// Build it with `init` (validating), `initUnchecked` (no validation) or
/// `initComptime` (validation at compile time).
pub const Utf8View = struct {
    bytes: []const u8,

    /// Wraps `s` after checking it with `utf8ValidateSlice`.
    /// Returns `error.InvalidUtf8` when the check fails.
    pub fn init(s: []const u8) !Utf8View {
        if (!utf8ValidateSlice(s)) {
            return error.InvalidUtf8;
        }

        return initUnchecked(s);
    }

    /// Wraps `s` without any validation. The caller is responsible for `s`
    /// being valid UTF-8; the iterator treats decode failures as `unreachable`.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return Utf8View {
            .bytes = s,
        };
    }

    /// Wraps a compile-time known `s`, turning invalid input into a
    /// compile error ("invalid utf8") instead of a runtime error.
    pub fn initComptime(comptime s: []const u8) Utf8View {
        if (comptime init(s)) |r| {
            return r;
        } else |err| switch (err) {
            error.InvalidUtf8 => {
                // `@compileError` stops compilation, so the statement
                // after it is never evaluated.
                @compileError("invalid utf8");
                unreachable;
            }
        }
    }

    /// Returns an iterator positioned at the first byte of the view.
    pub fn iterator(s: &const Utf8View) Utf8Iterator {
        return Utf8Iterator {
            .bytes = s.bytes,
            .i = 0,
        };
    }

    /// Same as `iterator`. Kept under its original name so existing
    /// callers continue to compile.
    pub fn Iterator(s: &const Utf8View) Utf8Iterator {
        return s.iterator();
    }
};
134
+
135
/// Walks a byte slice one UTF-8 codepoint at a time.
/// `bytes` is the text being iterated and `i` is the byte offset of the
/// next codepoint. Both methods treat malformed input as `unreachable`,
/// so `bytes` must already be valid UTF-8.
pub const Utf8Iterator = struct {
    bytes: []const u8,
    i: usize,

    /// Returns the bytes of the next codepoint and advances past them,
    /// or returns null once `i` has reached the end of `bytes`.
    pub fn nextCodepointSlice(it: &Utf8Iterator) ?[]const u8 {
        if (it.i >= it.bytes.len) {
            return null;
        }

        // The lead byte determines how many bytes the codepoint occupies.
        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;

        const start = it.i;
        it.i += cp_len;
        return it.bytes[start .. it.i];
    }

    /// Returns the next codepoint as a decoded value and advances past it,
    /// or returns null when no bytes remain.
    pub fn nextCodepoint(it: &Utf8Iterator) ?u32 {
        const slice = it.nextCodepointSlice() ?? return null;

        // Pick the decoder that matches the sequence length; a one-byte
        // sequence is its own codepoint value.
        const r = switch (slice.len) {
            1 => u32(slice[0]),
            2 => utf8Decode2(slice),
            3 => utf8Decode3(slice),
            4 => utf8Decode4(slice),
            else => unreachable,
        };

        return r catch unreachable;
    }
};
164
+
165
test "utf8 iterator on ascii" {
    const view = Utf8View.initComptime("abc");

    // Slice-wise iteration yields each ASCII byte as its own slice.
    var slice_it = view.Iterator();
    debug.assert(std.mem.eql(u8, "a", ??slice_it.nextCodepointSlice()));
    debug.assert(std.mem.eql(u8, "b", ??slice_it.nextCodepointSlice()));
    debug.assert(std.mem.eql(u8, "c", ??slice_it.nextCodepointSlice()));
    debug.assert(slice_it.nextCodepointSlice() == null);

    // Codepoint-wise iteration over a fresh iterator yields the same characters.
    var codepoint_it = view.Iterator();
    debug.assert(??codepoint_it.nextCodepoint() == 'a');
    debug.assert(??codepoint_it.nextCodepoint() == 'b');
    debug.assert(??codepoint_it.nextCodepoint() == 'c');
    debug.assert(codepoint_it.nextCodepoint() == null);
}
180
+
181
test "utf8 view bad" {
    // Compile-time error.
    // const s3 = Utf8View.initComptime("\xfe\xf2");

    // Runtime construction must reject the malformed input.
    const result = Utf8View.init("hel\xad lo");
    if (result) |_| {
        unreachable;
    } else |err| {
        debug.assert(err == error.InvalidUtf8);
    }
}
188
+
189
test "utf8 view ok" {
    const view = Utf8View.initComptime("東京市");

    // Each multi-byte character comes back as one slice.
    var slice_it = view.Iterator();
    debug.assert(std.mem.eql(u8, "東", ??slice_it.nextCodepointSlice()));
    debug.assert(std.mem.eql(u8, "京", ??slice_it.nextCodepointSlice()));
    debug.assert(std.mem.eql(u8, "市", ??slice_it.nextCodepointSlice()));
    debug.assert(slice_it.nextCodepointSlice() == null);

    // The decoded values are the characters' codepoints.
    var codepoint_it = view.Iterator();
    debug.assert(??codepoint_it.nextCodepoint() == 0x6771);
    debug.assert(??codepoint_it.nextCodepoint() == 0x4eac);
    debug.assert(??codepoint_it.nextCodepoint() == 0x5e02);
    debug.assert(codepoint_it.nextCodepoint() == null);
}
204
+
205
test "bad utf8 slice" {
    // Plain ASCII is accepted.
    debug.assert(utf8ValidateSlice("abc") == true);

    // A two-byte lead that is not followed by a continuation byte is rejected.
    debug.assert(utf8ValidateSlice("abc\xc0 ") == false);
    debug.assert(utf8ValidateSlice("abc\xc0 abc") == false);

    // A complete two-byte sequence is accepted.
    debug.assert(utf8ValidateSlice("abc\xdf\xbf ") == true);
}
211
+
81
212
test "valid utf8" {
82
213
testValid ("\x00 " , 0x0 );
83
214
testValid ("\x20 " , 0x20 );
@@ -145,17 +276,17 @@ fn testError(bytes: []const u8, expected_err: error) void {
145
276
if (testDecode (bytes )) | _ | {
146
277
unreachable ;
147
278
} else | err | {
148
- std . debug .assert (err == expected_err );
279
+ debug .assert (err == expected_err );
149
280
}
150
281
}
151
282
152
283
/// Asserts that `bytes` decodes without error to `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
    const decoded = testDecode(bytes) catch unreachable;
    debug.assert(decoded == expected_codepoint);
}
155
286
156
287
/// Test helper: decodes `bytes` as exactly one codepoint.
/// Propagates the error from `utf8ByteSequenceLength` for a bad lead byte,
/// returns `error.UnexpectedEof` when `bytes` is shorter than the lead byte
/// announces, and asserts that there are no extra bytes.
fn testDecode(bytes: []const u8) !u32 {
    const expected_len = try utf8ByteSequenceLength(bytes[0]);
    if (expected_len > bytes.len) {
        return error.UnexpectedEof;
    }
    debug.assert(expected_len == bytes.len);
    return utf8Decode(bytes);
}
0 commit comments