Skip to content

Commit c0d7f64

Browse files
authored
Merge pull request #12448 from r00ster91/ultimateascii
std.ascii: rename functions and other improvements
2 parents f9192ad + 4ea3a9b commit c0d7f64

File tree

1 file changed

+210
-31
lines changed

1 file changed

+210
-31
lines changed

lib/std/ascii.zig

+210-31
Original file line numberDiff line numberDiff line change
@@ -1,54 +1,164 @@
1-
// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does.
2-
// I could have taken only a u7 to make this clear, but it would be slower
3-
// It is my opinion that encodings other than UTF-8 should not be supported.
4-
//
5-
// (and 128 bytes is not much to pay).
6-
// Also does not handle Unicode character classes.
7-
//
8-
// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png
1+
//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.
2+
//!
3+
//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.
4+
//!
5+
//! Even though this module concerns itself with 7-bit ASCII,
6+
//! functions use `u8` as the type instead of `u7` for convenience and compatibility.
7+
//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`).
8+
//!
9+
//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set
910

1011
const std = @import("std");
1112

12-
/// Contains constants for the C0 control codes of the ASCII encoding.
13-
/// https://en.wikipedia.org/wiki/C0_and_C1_control_codes
13+
// TODO: remove all decls marked as DEPRECATED after 0.10.0's release
14+
15+
/// The C0 control codes of the ASCII encoding.
16+
///
17+
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`.
1418
pub const control_code = struct {
19+
// DEPRECATED: use the lowercase variant
1520
pub const NUL = 0x00;
21+
// DEPRECATED: use the lowercase variant
1622
pub const SOH = 0x01;
23+
// DEPRECATED: use the lowercase variant
1724
pub const STX = 0x02;
25+
// DEPRECATED: use the lowercase variant
1826
pub const ETX = 0x03;
27+
// DEPRECATED: use the lowercase variant
1928
pub const EOT = 0x04;
29+
// DEPRECATED: use the lowercase variant
2030
pub const ENQ = 0x05;
31+
// DEPRECATED: use the lowercase variant
2132
pub const ACK = 0x06;
33+
// DEPRECATED: use the lowercase variant
2234
pub const BEL = 0x07;
35+
// DEPRECATED: use the lowercase variant
2336
pub const BS = 0x08;
37+
// DEPRECATED: use `ht`
2438
pub const TAB = 0x09;
39+
// DEPRECATED: use the lowercase variant
2540
pub const LF = 0x0A;
41+
// DEPRECATED: use the lowercase variant
2642
pub const VT = 0x0B;
43+
// DEPRECATED: use the lowercase variant
2744
pub const FF = 0x0C;
45+
// DEPRECATED: use the lowercase variant
2846
pub const CR = 0x0D;
47+
// DEPRECATED: use the lowercase variant
2948
pub const SO = 0x0E;
49+
// DEPRECATED: use the lowercase variant
3050
pub const SI = 0x0F;
51+
// DEPRECATED: use the lowercase variant
3152
pub const DLE = 0x10;
53+
// DEPRECATED: use the lowercase variant
3254
pub const DC1 = 0x11;
55+
// DEPRECATED: use the lowercase variant
3356
pub const DC2 = 0x12;
57+
// DEPRECATED: use the lowercase variant
3458
pub const DC3 = 0x13;
59+
// DEPRECATED: use the lowercase variant
3560
pub const DC4 = 0x14;
61+
// DEPRECATED: use the lowercase variant
3662
pub const NAK = 0x15;
63+
// DEPRECATED: use the lowercase variant
3764
pub const SYN = 0x16;
65+
// DEPRECATED: use the lowercase variant
3866
pub const ETB = 0x17;
67+
// DEPRECATED: use the lowercase variant
3968
pub const CAN = 0x18;
69+
// DEPRECATED: use the lowercase variant
4070
pub const EM = 0x19;
71+
// DEPRECATED: use the lowercase variant
4172
pub const SUB = 0x1A;
73+
// DEPRECATED: use the lowercase variant
4274
pub const ESC = 0x1B;
75+
// DEPRECATED: use the lowercase variant
4376
pub const FS = 0x1C;
77+
// DEPRECATED: use the lowercase variant
4478
pub const GS = 0x1D;
79+
// DEPRECATED: use the lowercase variant
4580
pub const RS = 0x1E;
81+
// DEPRECATED: use the lowercase variant
4682
pub const US = 0x1F;
47-
83+
// DEPRECATED: use the lowercase variant
4884
pub const DEL = 0x7F;
49-
85+
// DEPRECATED: use the lowercase variant
5086
pub const XON = 0x11;
87+
// DEPRECATED: use the lowercase variant
5188
pub const XOFF = 0x13;
89+
90+
/// Null.
91+
pub const nul = 0x00;
92+
/// Start of Heading.
93+
pub const soh = 0x01;
94+
/// Start of Text.
95+
pub const stx = 0x02;
96+
/// End of Text.
97+
pub const etx = 0x03;
98+
/// End of Transmission.
99+
pub const eot = 0x04;
100+
/// Enquiry.
101+
pub const enq = 0x05;
102+
/// Acknowledge.
103+
pub const ack = 0x06;
104+
/// Bell, Alert.
105+
pub const bel = 0x07;
106+
/// Backspace.
107+
pub const bs = 0x08;
108+
/// Horizontal Tab, Tab ('\t').
109+
pub const ht = 0x09;
110+
/// Line Feed, Newline ('\n').
111+
pub const lf = 0x0A;
112+
/// Vertical Tab.
113+
pub const vt = 0x0B;
114+
/// Form Feed.
115+
pub const ff = 0x0C;
116+
/// Carriage Return ('\r').
117+
pub const cr = 0x0D;
118+
/// Shift Out.
119+
pub const so = 0x0E;
120+
/// Shift In.
121+
pub const si = 0x0F;
122+
/// Data Link Escape.
123+
pub const dle = 0x10;
124+
/// Device Control One (XON).
125+
pub const dc1 = 0x11;
126+
/// Device Control Two.
127+
pub const dc2 = 0x12;
128+
/// Device Control Three (XOFF).
129+
pub const dc3 = 0x13;
130+
/// Device Control Four.
131+
pub const dc4 = 0x14;
132+
/// Negative Acknowledge.
133+
pub const nak = 0x15;
134+
/// Synchronous Idle.
135+
pub const syn = 0x16;
136+
/// End of Transmission Block.
137+
pub const etb = 0x17;
138+
/// Cancel.
139+
pub const can = 0x18;
140+
/// End of Medium.
141+
pub const em = 0x19;
142+
/// Substitute.
143+
pub const sub = 0x1A;
144+
/// Escape.
145+
pub const esc = 0x1B;
146+
/// File Separator.
147+
pub const fs = 0x1C;
148+
/// Group Separator.
149+
pub const gs = 0x1D;
150+
/// Record Separator.
151+
pub const rs = 0x1E;
152+
/// Unit Separator.
153+
pub const us = 0x1F;
154+
155+
/// Delete.
156+
pub const del = 0x7F;
157+
158+
/// An alias to `dc1`.
159+
pub const xon = dc1;
160+
/// An alias to `dc3`.
161+
pub const xoff = dc3;
52162
};
53163

54164
const tIndex = enum(u3) {
@@ -188,73 +298,106 @@ fn inTable(c: u8, t: tIndex) bool {
188298
return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0;
189299
}
190300

191-
pub fn isAlNum(c: u8) bool {
301+
/// DEPRECATED: use `isAlphanumeric`
302+
pub const isAlNum = isAlphanumeric;
303+
/// DEPRECATED: use `isAlphabetic`
304+
pub const isAlpha = isAlphabetic;
305+
/// DEPRECATED: use `isControl`
306+
pub const isCntrl = isControl;
307+
/// DEPRECATED: use `isWhitespace`.
308+
pub const isSpace = isWhitespace;
309+
/// DEPRECATED: use `whitespace`.
310+
pub const spaces = whitespace;
311+
/// DEPRECATED: use `isHex`.
312+
pub const isXDigit = isHex;
313+
314+
/// Returns whether the character is alphanumeric.
315+
pub fn isAlphanumeric(c: u8) bool {
192316
return (combinedTable[c] & ((@as(u8, 1) << @enumToInt(tIndex.Alpha)) |
193317
@as(u8, 1) << @enumToInt(tIndex.Digit))) != 0;
194318
}
195319

196-
pub fn isAlpha(c: u8) bool {
320+
/// Returns whether the character is alphabetic.
321+
pub fn isAlphabetic(c: u8) bool {
197322
return inTable(c, tIndex.Alpha);
198323
}
199324

200-
pub fn isCntrl(c: u8) bool {
201-
return c < 0x20 or c == 127; //DEL
325+
/// Returns whether the character is a control character.
326+
/// For 7-bit ASCII characters, this is the same as `!isPrint(c)`.
327+
///
328+
/// See also: `control_code`.
329+
pub fn isControl(c: u8) bool {
330+
return c <= control_code.us or c == control_code.del;
202331
}
203332

333+
/// Returns whether the character is a digit.
204334
pub fn isDigit(c: u8) bool {
205335
return inTable(c, tIndex.Digit);
206336
}
207337

338+
/// DEPRECATED: use `isPrint(c) and c != ' '` instead
208339
pub fn isGraph(c: u8) bool {
209340
return inTable(c, tIndex.Graph);
210341
}
211342

343+
/// Returns whether the character is a lowercase letter.
212344
pub fn isLower(c: u8) bool {
213345
return inTable(c, tIndex.Lower);
214346
}
215347

348+
/// Returns whether the character has some graphical representation and can be printed.
349+
/// This also returns `true` for the space character.
350+
/// For 7-bit ASCII characters, this is the same as `!isControl(c)`.
216351
pub fn isPrint(c: u8) bool {
217352
return inTable(c, tIndex.Graph) or c == ' ';
218353
}
219354

355+
/// DEPRECATED: create your own function based on your needs and what you want to do.
220356
pub fn isPunct(c: u8) bool {
221357
return inTable(c, tIndex.Punct);
222358
}
223359

224-
pub fn isSpace(c: u8) bool {
360+
/// Returns whether this character is included in `whitespace`.
361+
pub fn isWhitespace(c: u8) bool {
225362
return inTable(c, tIndex.Space);
226363
}
227364

228-
/// All the values for which isSpace() returns true. This may be used with
229-
/// e.g. std.mem.trim() to trim whiteSpace.
230-
pub const spaces = [_]u8{ ' ', '\t', '\n', '\r', control_code.VT, control_code.FF };
365+
/// Whitespace for general use.
366+
/// This may be used with e.g. `std.mem.trim` to trim whitespace.
367+
///
368+
/// See also: `isWhitespace`.
369+
pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff };
231370

232-
test "spaces" {
233-
const testing = std.testing;
234-
for (spaces) |space| try testing.expect(isSpace(space));
371+
test "whitespace" {
372+
for (whitespace) |char| try std.testing.expect(isWhitespace(char));
235373

236374
var i: u8 = 0;
237375
while (isASCII(i)) : (i += 1) {
238-
if (isSpace(i)) try testing.expect(std.mem.indexOfScalar(u8, &spaces, i) != null);
376+
if (isWhitespace(i)) try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, i) != null);
239377
}
240378
}
241379

380+
/// Returns whether the character is an uppercase letter.
242381
pub fn isUpper(c: u8) bool {
243382
return inTable(c, tIndex.Upper);
244383
}
245384

246-
pub fn isXDigit(c: u8) bool {
385+
/// Returns whether the character is a hexadecimal digit. This is case-insensitive.
386+
pub fn isHex(c: u8) bool {
247387
return inTable(c, tIndex.Hex);
248388
}
249389

390+
/// Returns whether the character is a 7-bit ASCII character.
250391
pub fn isASCII(c: u8) bool {
251392
return c < 128;
252393
}
253394

395+
/// DEPRECATED: use `c == ' ' or c == '\t'` or try `isWhitespace`
254396
pub fn isBlank(c: u8) bool {
255397
return (c == ' ') or (c == '\x09');
256398
}
257399

400+
/// Returns the uppercase form of the character, or the character unchanged if it is already uppercase or not a letter.
258401
pub fn toUpper(c: u8) u8 {
259402
if (isLower(c)) {
260403
return c & 0b11011111;
@@ -263,6 +406,7 @@ pub fn toUpper(c: u8) u8 {
263406
}
264407
}
265408

409+
/// Returns the lowercase form of the character, or the character unchanged if it is already lowercase or not a letter.
266410
pub fn toLower(c: u8) u8 {
267411
if (isUpper(c)) {
268412
return c | 0b00100000;
@@ -274,13 +418,50 @@ pub fn toLower(c: u8) u8 {
274418
test "ascii character classes" {
275419
const testing = std.testing;
276420

421+
try testing.expect(!isControl('a'));
422+
try testing.expect(!isControl('z'));
423+
try testing.expect(isControl(control_code.nul));
424+
try testing.expect(isControl(control_code.ff));
425+
try testing.expect(isControl(control_code.us));
426+
277427
try testing.expect('C' == toUpper('c'));
278428
try testing.expect(':' == toUpper(':'));
279429
try testing.expect('\xab' == toUpper('\xab'));
430+
try testing.expect(!isUpper('z'));
431+
280432
try testing.expect('c' == toLower('C'));
433+
try testing.expect(':' == toLower(':'));
434+
try testing.expect('\xab' == toLower('\xab'));
435+
try testing.expect(!isLower('Z'));
436+
437+
try testing.expect(isAlphanumeric('Z'));
438+
try testing.expect(isAlphanumeric('z'));
439+
try testing.expect(isAlphanumeric('5'));
440+
try testing.expect(isAlphanumeric('5'));
441+
try testing.expect(!isAlphanumeric('!'));
442+
443+
try testing.expect(!isAlpha('5'));
281444
try testing.expect(isAlpha('c'));
282445
try testing.expect(!isAlpha('5'));
283-
try testing.expect(isSpace(' '));
446+
447+
try testing.expect(isWhitespace(' '));
448+
try testing.expect(isWhitespace('\t'));
449+
try testing.expect(isWhitespace('\r'));
450+
try testing.expect(isWhitespace('\n'));
451+
try testing.expect(!isWhitespace('.'));
452+
453+
try testing.expect(!isHex('g'));
454+
try testing.expect(isHex('b'));
455+
try testing.expect(isHex('9'));
456+
457+
try testing.expect(!isDigit('~'));
458+
try testing.expect(isDigit('0'));
459+
try testing.expect(isDigit('9'));
460+
461+
try testing.expect(isPrint(' '));
462+
try testing.expect(isPrint('@'));
463+
try testing.expect(isPrint('~'));
464+
try testing.expect(!isPrint(control_code.esc));
284465
}
285466

286467
/// Writes a lower case copy of `ascii_string` to `output`.
@@ -341,7 +522,7 @@ test "allocUpperString" {
341522
try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", result);
342523
}
343524

344-
/// Compares strings `a` and `b` case insensitively and returns whether they are equal.
525+
/// Compares strings `a` and `b` case-insensitively and returns whether they are equal.
345526
pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
346527
if (a.len != b.len) return false;
347528
for (a) |a_c, i| {
@@ -397,11 +578,10 @@ test "indexOfIgnoreCase" {
397578
try std.testing.expect(indexOfIgnoreCase("one two three FouR", "gOur") == null);
398579
try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
399580
try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);
400-
401581
try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);
402582
}
403583

404-
/// Compares two slices of numbers lexicographically. O(n).
584+
/// Returns the lexicographical order of two slices. O(n).
405585
pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
406586
const n = std.math.min(lhs.len, rhs.len);
407587
var i: usize = 0;
@@ -415,8 +595,7 @@ pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
415595
return std.math.order(lhs.len, rhs.len);
416596
}
417597

418-
/// Returns true if lhs < rhs, false otherwise
419-
/// TODO rename "IgnoreCase" to "Insensitive" in this entire file.
598+
/// Returns whether `lhs` is ordered before `rhs`, i.e. whether `orderIgnoreCase(lhs, rhs)` is `.lt`.
420599
pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
421600
return orderIgnoreCase(lhs, rhs) == .lt;
422601
}

0 commit comments

Comments
 (0)