diff --git a/lib/std/ascii.zig b/lib/std/ascii.zig index 8174361800f5..cd8b14e98fcd 100644 --- a/lib/std/ascii.zig +++ b/lib/std/ascii.zig @@ -1,54 +1,164 @@ -// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does. -// I could have taken only a u7 to make this clear, but it would be slower -// It is my opinion that encodings other than UTF-8 should not be supported. -// -// (and 128 bytes is not much to pay). -// Also does not handle Unicode character classes. -// -// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png +//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard. +//! +//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding. +//! +//! Even though this module concerns itself with 7-bit ASCII, +//! functions use `u8` as the type instead of `u7` for convenience and compatibility. +//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`). +//! +//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set const std = @import("std"); -/// Contains constants for the C0 control codes of the ASCII encoding. -/// https://en.wikipedia.org/wiki/C0_and_C1_control_codes +// TODO: remove all decls marked as DEPRECATED after 0.10.0's release + +/// The C0 control codes of the ASCII encoding. +/// +/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`. 
pub const control_code = struct { + // DEPRECATED: use the lowercase variant pub const NUL = 0x00; + // DEPRECATED: use the lowercase variant pub const SOH = 0x01; + // DEPRECATED: use the lowercase variant pub const STX = 0x02; + // DEPRECATED: use the lowercase variant pub const ETX = 0x03; + // DEPRECATED: use the lowercase variant pub const EOT = 0x04; + // DEPRECATED: use the lowercase variant pub const ENQ = 0x05; + // DEPRECATED: use the lowercase variant pub const ACK = 0x06; + // DEPRECATED: use the lowercase variant pub const BEL = 0x07; + // DEPRECATED: use the lowercase variant pub const BS = 0x08; + // DEPRECATED: use `ht` pub const TAB = 0x09; + // DEPRECATED: use the lowercase variant pub const LF = 0x0A; + // DEPRECATED: use the lowercase variant pub const VT = 0x0B; + // DEPRECATED: use the lowercase variant pub const FF = 0x0C; + // DEPRECATED: use the lowercase variant pub const CR = 0x0D; + // DEPRECATED: use the lowercase variant pub const SO = 0x0E; + // DEPRECATED: use the lowercase variant pub const SI = 0x0F; + // DEPRECATED: use the lowercase variant pub const DLE = 0x10; + // DEPRECATED: use the lowercase variant pub const DC1 = 0x11; + // DEPRECATED: use the lowercase variant pub const DC2 = 0x12; + // DEPRECATED: use the lowercase variant pub const DC3 = 0x13; + // DEPRECATED: use the lowercase variant pub const DC4 = 0x14; + // DEPRECATED: use the lowercase variant pub const NAK = 0x15; + // DEPRECATED: use the lowercase variant pub const SYN = 0x16; + // DEPRECATED: use the lowercase variant pub const ETB = 0x17; + // DEPRECATED: use the lowercase variant pub const CAN = 0x18; + // DEPRECATED: use the lowercase variant pub const EM = 0x19; + // DEPRECATED: use the lowercase variant pub const SUB = 0x1A; + // DEPRECATED: use the lowercase variant pub const ESC = 0x1B; + // DEPRECATED: use the lowercase variant pub const FS = 0x1C; + // DEPRECATED: use the lowercase variant pub const GS = 0x1D; + // DEPRECATED: use the lowercase variant 
pub const RS = 0x1E; + // DEPRECATED: use the lowercase variant pub const US = 0x1F; - + // DEPRECATED: use the lowercase variant pub const DEL = 0x7F; - + // DEPRECATED: use the lowercase variant pub const XON = 0x11; + // DEPRECATED: use the lowercase variant pub const XOFF = 0x13; + + /// Null. + pub const nul = 0x00; + /// Start of Heading. + pub const soh = 0x01; + /// Start of Text. + pub const stx = 0x02; + /// End of Text. + pub const etx = 0x03; + /// End of Transmission. + pub const eot = 0x04; + /// Enquiry. + pub const enq = 0x05; + /// Acknowledge. + pub const ack = 0x06; + /// Bell, Alert. + pub const bel = 0x07; + /// Backspace. + pub const bs = 0x08; + /// Horizontal Tab, Tab ('\t'). + pub const ht = 0x09; + /// Line Feed, Newline ('\n'). + pub const lf = 0x0A; + /// Vertical Tab. + pub const vt = 0x0B; + /// Form Feed. + pub const ff = 0x0C; + /// Carriage Return ('\r'). + pub const cr = 0x0D; + /// Shift Out. + pub const so = 0x0E; + /// Shift In. + pub const si = 0x0F; + /// Data Link Escape. + pub const dle = 0x10; + /// Device Control One (XON). + pub const dc1 = 0x11; + /// Device Control Two. + pub const dc2 = 0x12; + /// Device Control Three (XOFF). + pub const dc3 = 0x13; + /// Device Control Four. + pub const dc4 = 0x14; + /// Negative Acknowledge. + pub const nak = 0x15; + /// Synchronous Idle. + pub const syn = 0x16; + /// End of Transmission Block. + pub const etb = 0x17; + /// Cancel. + pub const can = 0x18; + /// End of Medium. + pub const em = 0x19; + /// Substitute. + pub const sub = 0x1A; + /// Escape. + pub const esc = 0x1B; + /// File Separator. + pub const fs = 0x1C; + /// Group Separator. + pub const gs = 0x1D; + /// Record Separator. + pub const rs = 0x1E; + /// Unit Separator. + pub const us = 0x1F; + + /// Delete. + pub const del = 0x7F; + + /// An alias to `dc1`. + pub const xon = dc1; + /// An alias to `dc3`. 
+ pub const xoff = dc3; }; const tIndex = enum(u3) { @@ -188,73 +298,106 @@ fn inTable(c: u8, t: tIndex) bool { return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0; } -pub fn isAlNum(c: u8) bool { +/// DEPRECATED: use `isAlphanumeric` +pub const isAlNum = isAlphanumeric; +/// DEPRECATED: use `isAlphabetic` +pub const isAlpha = isAlphabetic; +/// DEPRECATED: use `isControl` +pub const isCntrl = isControl; +/// DEPRECATED: use `isWhitespace`. +pub const isSpace = isWhitespace; +/// DEPRECATED: use `whitespace`. +pub const spaces = whitespace; +/// DEPRECATED: use `isHex`. +pub const isXDigit = isHex; + +/// Returns whether the character is alphanumeric. +pub fn isAlphanumeric(c: u8) bool { return (combinedTable[c] & ((@as(u8, 1) << @enumToInt(tIndex.Alpha)) | @as(u8, 1) << @enumToInt(tIndex.Digit))) != 0; } -pub fn isAlpha(c: u8) bool { +/// Returns whether the character is alphabetic. +pub fn isAlphabetic(c: u8) bool { return inTable(c, tIndex.Alpha); } -pub fn isCntrl(c: u8) bool { - return c < 0x20 or c == 127; //DEL +/// Returns whether the character is a control character. +/// For 7-bit ASCII characters, this is the same as `!isPrint(c)`. +/// +/// See also: `control_code`. +pub fn isControl(c: u8) bool { + return c <= control_code.us or c == control_code.del; } +/// Returns whether the character is a digit. pub fn isDigit(c: u8) bool { return inTable(c, tIndex.Digit); } +/// DEPRECATED: use `isPrint(c) and c != ' '` instead pub fn isGraph(c: u8) bool { return inTable(c, tIndex.Graph); } +/// Returns whether the character is a lowercased letter. pub fn isLower(c: u8) bool { return inTable(c, tIndex.Lower); } +/// Returns whether the character has some graphical representation and can be printed. +/// This also returns `true` for the space character. +/// For 7-bit ASCII characters, this is the same as `!isControl(c)`. pub fn isPrint(c: u8) bool { return inTable(c, tIndex.Graph) or c == ' '; } +/// DEPRECATED: create your own function based on your needs and what you want to do. 
pub fn isPunct(c: u8) bool { return inTable(c, tIndex.Punct); } -pub fn isSpace(c: u8) bool { +/// Returns whether this character is included in `whitespace`. +pub fn isWhitespace(c: u8) bool { return inTable(c, tIndex.Space); } -/// All the values for which isSpace() returns true. This may be used with -/// e.g. std.mem.trim() to trim whiteSpace. -pub const spaces = [_]u8{ ' ', '\t', '\n', '\r', control_code.VT, control_code.FF }; +/// Whitespace for general use. +/// This may be used with e.g. `std.mem.trim` to trim whitespace. +/// +/// See also: `isWhitespace`. +pub const whitespace = [_]u8{ ' ', '\t', '\n', '\r', control_code.vt, control_code.ff }; -test "spaces" { - const testing = std.testing; - for (spaces) |space| try testing.expect(isSpace(space)); +test "whitespace" { + for (whitespace) |char| try std.testing.expect(isWhitespace(char)); var i: u8 = 0; while (isASCII(i)) : (i += 1) { - if (isSpace(i)) try testing.expect(std.mem.indexOfScalar(u8, &spaces, i) != null); + if (isWhitespace(i)) try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, i) != null); } } +/// Returns whether the character is an uppercased letter. pub fn isUpper(c: u8) bool { return inTable(c, tIndex.Upper); } -pub fn isXDigit(c: u8) bool { +/// Returns whether the character is a hexadecimal digit. This is case-insensitive. +pub fn isHex(c: u8) bool { return inTable(c, tIndex.Hex); } +/// Returns whether the character is a 7-bit ASCII character. pub fn isASCII(c: u8) bool { return c < 128; } +/// DEPRECATED: use `c == ' ' or c == '\t'` or try `isWhitespace` pub fn isBlank(c: u8) bool { return (c == ' ') or (c == '\x09'); } +/// Uppercases the character and returns it as-is if it's already uppercased or not a letter. pub fn toUpper(c: u8) u8 { if (isLower(c)) { return c & 0b11011111; @@ -263,6 +406,7 @@ pub fn toUpper(c: u8) u8 { } } +/// Lowercases the character and returns it as-is if it's already lowercased or not a letter. 
pub fn toLower(c: u8) u8 { if (isUpper(c)) { return c | 0b00100000; @@ -274,13 +418,50 @@ pub fn toLower(c: u8) u8 { test "ascii character classes" { const testing = std.testing; + try testing.expect(!isControl('a')); + try testing.expect(!isControl('z')); + try testing.expect(isControl(control_code.nul)); + try testing.expect(isControl(control_code.ff)); + try testing.expect(isControl(control_code.us)); + try testing.expect('C' == toUpper('c')); try testing.expect(':' == toUpper(':')); try testing.expect('\xab' == toUpper('\xab')); + try testing.expect(!isUpper('z')); + try testing.expect('c' == toLower('C')); + try testing.expect(':' == toLower(':')); + try testing.expect('\xab' == toLower('\xab')); + try testing.expect(!isLower('Z')); + + try testing.expect(isAlphanumeric('Z')); + try testing.expect(isAlphanumeric('z')); + try testing.expect(isAlphanumeric('5')); + try testing.expect(isAlphanumeric('5')); + try testing.expect(!isAlphanumeric('!')); + + try testing.expect(!isAlpha('5')); try testing.expect(isAlpha('c')); try testing.expect(!isAlpha('5')); - try testing.expect(isSpace(' ')); + + try testing.expect(isWhitespace(' ')); + try testing.expect(isWhitespace('\t')); + try testing.expect(isWhitespace('\r')); + try testing.expect(isWhitespace('\n')); + try testing.expect(!isWhitespace('.')); + + try testing.expect(!isHex('g')); + try testing.expect(isHex('b')); + try testing.expect(isHex('9')); + + try testing.expect(!isDigit('~')); + try testing.expect(isDigit('0')); + try testing.expect(isDigit('9')); + + try testing.expect(isPrint(' ')); + try testing.expect(isPrint('@')); + try testing.expect(isPrint('~')); + try testing.expect(!isPrint(control_code.esc)); } /// Writes a lower case copy of `ascii_string` to `output`. @@ -341,7 +522,7 @@ test "allocUpperString" { try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", result); } -/// Compares strings `a` and `b` case insensitively and returns whether they are equal. 
+/// Compares strings `a` and `b` case-insensitively and returns whether they are equal. pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool { if (a.len != b.len) return false; for (a) |a_c, i| { @@ -397,11 +578,10 @@ test "indexOfIgnoreCase" { try std.testing.expect(indexOfIgnoreCase("one two three FouR", "gOur") == null); try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0); try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null); - try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0); } -/// Compares two slices of numbers lexicographically. O(n). +/// Returns the lexicographical order of two slices. O(n). pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order { const n = std.math.min(lhs.len, rhs.len); var i: usize = 0; @@ -415,8 +595,7 @@ pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order { return std.math.order(lhs.len, rhs.len); } -/// Returns true if lhs < rhs, false otherwise -/// TODO rename "IgnoreCase" to "Insensitive" in this entire file. +/// Returns whether the lexicographical order of `lhs` is lower than `rhs`. pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool { return orderIgnoreCase(lhs, rhs) == .lt; }