Skip to content

Commit 08d595b

Browse files
tiehuis authored and thejoshwolfe committed
Add utf8 string view
1 parent 8db7a14 commit 08d595b

File tree

1 file changed

+140
-9
lines changed

1 file changed

+140
-9
lines changed

std/unicode.zig

Lines changed: 140 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
const std = @import("./index.zig");
2+
const debug = std.debug;
23

34
/// Given the first byte of a UTF-8 codepoint,
45
/// returns a number 1-4 indicating the total length of the codepoint in bytes.
@@ -25,8 +26,8 @@ pub fn utf8Decode(bytes: []const u8) !u32 {
2526
};
2627
}
2728
pub fn utf8Decode2(bytes: []const u8) !u32 {
28-
std.debug.assert(bytes.len == 2);
29-
std.debug.assert(bytes[0] & 0b11100000 == 0b11000000);
29+
debug.assert(bytes.len == 2);
30+
debug.assert(bytes[0] & 0b11100000 == 0b11000000);
3031
var value: u32 = bytes[0] & 0b00011111;
3132

3233
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
@@ -38,8 +39,8 @@ pub fn utf8Decode2(bytes: []const u8) !u32 {
3839
return value;
3940
}
4041
pub fn utf8Decode3(bytes: []const u8) !u32 {
41-
std.debug.assert(bytes.len == 3);
42-
std.debug.assert(bytes[0] & 0b11110000 == 0b11100000);
42+
debug.assert(bytes.len == 3);
43+
debug.assert(bytes[0] & 0b11110000 == 0b11100000);
4344
var value: u32 = bytes[0] & 0b00001111;
4445

4546
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
@@ -56,8 +57,8 @@ pub fn utf8Decode3(bytes: []const u8) !u32 {
5657
return value;
5758
}
5859
pub fn utf8Decode4(bytes: []const u8) !u32 {
59-
std.debug.assert(bytes.len == 4);
60-
std.debug.assert(bytes[0] & 0b11111000 == 0b11110000);
60+
debug.assert(bytes.len == 4);
61+
debug.assert(bytes[0] & 0b11111000 == 0b11110000);
6162
var value: u32 = bytes[0] & 0b00000111;
6263

6364
if (bytes[1] & 0b11000000 != 0b10000000) return error.Utf8ExpectedContinuation;
@@ -78,6 +79,136 @@ pub fn utf8Decode4(bytes: []const u8) !u32 {
7879
return value;
7980
}
8081

82+
/// Returns true when every byte of `s` belongs to a sequence that
/// `utf8ByteSequenceLength` and `utf8Decode` both accept.
/// Returns false as soon as a lead byte is rejected, a sequence would
/// extend past the end of `s`, or decoding the sequence fails.
/// An empty slice is reported as valid.
pub fn utf8ValidateSlice(s: []const u8) bool {
    var pos: usize = 0;
    while (pos < s.len) {
        const seq_len = utf8ByteSequenceLength(s[pos]) catch return false;

        // A sequence that is cut off by the end of the input is invalid.
        if (pos + seq_len > s.len) return false;

        if (utf8Decode(s[pos..pos + seq_len])) |_| {
            pos += seq_len;
        } else |_| {
            return false;
        }
    }
    return true;
}
98+
99+
/// A borrowed view over a byte slice that is treated as UTF-8 text.
/// Use `init` to validate at runtime, `initComptime` to validate a
/// compile-time known string, or `initUnchecked` to skip validation.
/// Declared `pub` so that code importing this module can name the type;
/// all of its methods were already public.
pub const Utf8View = struct {
    // The underlying bytes; the view does not copy them.
    bytes: []const u8,

    /// Wraps `s` after checking it with `utf8ValidateSlice`.
    /// Returns error.InvalidUtf8 when validation fails.
    pub fn init(s: []const u8) !Utf8View {
        if (!utf8ValidateSlice(s)) {
            return error.InvalidUtf8;
        }

        return initUnchecked(s);
    }

    /// Wraps `s` without performing any validation.
    /// Iterating a view built this way asserts (via `catch unreachable` in
    /// `Utf8Iterator`) that the bytes are well formed.
    pub fn initUnchecked(s: []const u8) Utf8View {
        return Utf8View {
            .bytes = s,
        };
    }

    /// Validates `s` during compilation and emits the compile error
    /// "invalid utf8" when `init` rejects it.
    pub fn initComptime(comptime s: []const u8) Utf8View {
        if (comptime init(s)) |r| {
            return r;
        } else |err| switch (err) {
            error.InvalidUtf8 => {
                @compileError("invalid utf8");
                unreachable;
            }
        }
    }

    /// Returns an iterator positioned at the first byte of the view.
    pub fn Iterator(s: &const Utf8View) Utf8Iterator {
        return Utf8Iterator {
            .bytes = s.bytes,
            .i = 0,
        };
    }
};
134+
135+
/// Walks a byte slice one UTF-8 sequence at a time.
/// Declared `pub` because it is the return type of the public
/// `Utf8View.Iterator` and callers need to be able to name it.
/// Both methods use `catch unreachable`, i.e. they assert that `bytes`
/// holds well-formed UTF-8.
pub const Utf8Iterator = struct {
    // Bytes being iterated.
    bytes: []const u8,
    // Index of the first byte of the next sequence to return.
    i: usize,

    /// Returns the bytes of the next sequence and advances past them,
    /// or null once `i` has reached the end of `bytes`.
    pub fn nextCodepointSlice(it: &Utf8Iterator) ?[]const u8 {
        if (it.i >= it.bytes.len) {
            return null;
        }

        // The lead byte is asserted to be a valid sequence start.
        const cp_len = utf8ByteSequenceLength(it.bytes[it.i]) catch unreachable;

        it.i += cp_len;
        return it.bytes[it.i-cp_len..it.i];
    }

    /// Returns the next decoded codepoint, or null at the end of input.
    pub fn nextCodepoint(it: &Utf8Iterator) ?u32 {
        const slice = it.nextCodepointSlice() ?? return null;

        // Dispatch on the sequence length; a single byte is the codepoint itself.
        const r = switch (slice.len) {
            1 => u32(slice[0]),
            2 => utf8Decode2(slice),
            3 => utf8Decode3(slice),
            4 => utf8Decode4(slice),
            else => unreachable,
        };

        // Decoding is asserted not to fail.
        return r catch unreachable;
    }
};
164+
165+
test "utf8 iterator on ascii" {
    const view = Utf8View.initComptime("abc");

    // Slice-wise iteration yields one byte per ASCII character.
    var slice_it = view.Iterator();
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "a"));
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "b"));
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "c"));
    debug.assert(slice_it.nextCodepointSlice() == null);

    // Codepoint-wise iteration yields the character values.
    var codepoint_it = view.Iterator();
    debug.assert('a' == ??codepoint_it.nextCodepoint());
    debug.assert('b' == ??codepoint_it.nextCodepoint());
    debug.assert('c' == ??codepoint_it.nextCodepoint());
    debug.assert(codepoint_it.nextCodepoint() == null);
}
180+
181+
test "utf8 view bad" {
    // The comptime constructor rejects invalid input with a compile-time error:
    // const s3 = Utf8View.initComptime("\xfe\xf2");

    // The runtime constructor reports the problem as an error value.
    const result = Utf8View.init("hel\xadlo");
    if (result) |_| {
        unreachable;
    } else |err| {
        debug.assert(err == error.InvalidUtf8);
    }
}
188+
189+
test "utf8 view ok" {
    const view = Utf8View.initComptime("東京市");

    // Each character is returned as its own multi-byte slice.
    var slice_it = view.Iterator();
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "東"));
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "京"));
    debug.assert(std.mem.eql(u8, ??slice_it.nextCodepointSlice(), "市"));
    debug.assert(slice_it.nextCodepointSlice() == null);

    // The same characters decoded to their codepoint values.
    var codepoint_it = view.Iterator();
    debug.assert(0x6771 == ??codepoint_it.nextCodepoint());
    debug.assert(0x4eac == ??codepoint_it.nextCodepoint());
    debug.assert(0x5e02 == ??codepoint_it.nextCodepoint());
    debug.assert(codepoint_it.nextCodepoint() == null);
}
204+
205+
test "bad utf8 slice" {
    // Inputs that validate.
    debug.assert(utf8ValidateSlice("abc") == true);
    debug.assert(utf8ValidateSlice("abc\xdf\xbf") == true);

    // Inputs that are rejected: a lead byte at the end, and one followed by ASCII.
    debug.assert(utf8ValidateSlice("abc\xc0") == false);
    debug.assert(utf8ValidateSlice("abc\xc0abc") == false);
}
211+
81212
test "valid utf8" {
82213
testValid("\x00", 0x0);
83214
testValid("\x20", 0x20);
@@ -145,17 +276,17 @@ fn testError(bytes: []const u8, expected_err: error) void {
145276
if (testDecode(bytes)) |_| {
146277
unreachable;
147278
} else |err| {
148-
std.debug.assert(err == expected_err);
279+
debug.assert(err == expected_err);
149280
}
150281
}
151282

152283
/// Asserts that `bytes` decodes without error to `expected_codepoint`.
fn testValid(bytes: []const u8, expected_codepoint: u32) void {
    const decoded = testDecode(bytes) catch unreachable;
    debug.assert(decoded == expected_codepoint);
}
155286

156287
/// Decodes `bytes` as exactly one UTF-8 sequence.
/// Propagates the error from `utf8ByteSequenceLength` for a bad lead byte,
/// returns error.UnexpectedEof when `bytes` is shorter than the lead byte
/// announces, and asserts that there are no extra bytes.
fn testDecode(bytes: []const u8) !u32 {
    const expected_len = try utf8ByteSequenceLength(bytes[0]);
    if (expected_len > bytes.len) {
        return error.UnexpectedEof;
    }
    debug.assert(expected_len == bytes.len);
    return utf8Decode(bytes);
}

0 commit comments

Comments
 (0)