Skip to content

std.ascii: rename functions and other improvements #12448

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Oct 15, 2022
241 changes: 210 additions & 31 deletions lib/std/ascii.zig
Original file line number Diff line number Diff line change
@@ -1,54 +1,164 @@
// Does NOT look at the locale the way C89's toupper(3), isspace() et cetera does.
// I could have taken only a u7 to make this clear, but it would be slower
// It is my opinion that encodings other than UTF-8 should not be supported.
//
// (and 128 bytes is not much to pay).
// Also does not handle Unicode character classes.
//
// https://upload.wikimedia.org/wikipedia/commons/thumb/c/cf/USASCII_code_chart.png/1200px-USASCII_code_chart.png
//! The 7-bit [ASCII](https://en.wikipedia.org/wiki/ASCII) character encoding standard.
//!
//! This is not to be confused with the 8-bit [extended ASCII](https://en.wikipedia.org/wiki/Extended_ASCII) character encoding.
//!
//! Even though this module concerns itself with 7-bit ASCII,
//! functions use `u8` as the type instead of `u7` for convenience and compatibility.
//! Characters outside of the 7-bit range are gracefully handled (e.g. by returning `false`).
//!
//! See also: https://en.wikipedia.org/wiki/ASCII#Character_set

const std = @import("std");

/// Contains constants for the C0 control codes of the ASCII encoding.
/// https://en.wikipedia.org/wiki/C0_and_C1_control_codes
// TODO: remove all decls marked as DEPRECATED after 0.10.0's release

/// The C0 control codes of the ASCII encoding.
///
/// See also: https://en.wikipedia.org/wiki/C0_and_C1_control_codes and `isControl`.
pub const control_code = struct {
// The uppercase names below are deprecated spellings kept for backward
// compatibility. Each one has the same value as the lowercase constant
// named in its deprecation notice.

/// DEPRECATED: use `nul`
pub const NUL = 0x00;
/// DEPRECATED: use `soh`
pub const SOH = 0x01;
/// DEPRECATED: use `stx`
pub const STX = 0x02;
/// DEPRECATED: use `etx`
pub const ETX = 0x03;
/// DEPRECATED: use `eot`
pub const EOT = 0x04;
/// DEPRECATED: use `enq`
pub const ENQ = 0x05;
/// DEPRECATED: use `ack`
pub const ACK = 0x06;
/// DEPRECATED: use `bel`
pub const BEL = 0x07;
/// DEPRECATED: use `bs`
pub const BS = 0x08;
/// DEPRECATED: use `ht`
pub const TAB = 0x09;
/// DEPRECATED: use `lf`
pub const LF = 0x0A;
/// DEPRECATED: use `vt`
pub const VT = 0x0B;
/// DEPRECATED: use `ff`
pub const FF = 0x0C;
/// DEPRECATED: use `cr`
pub const CR = 0x0D;
/// DEPRECATED: use `so`
pub const SO = 0x0E;
/// DEPRECATED: use `si`
pub const SI = 0x0F;
/// DEPRECATED: use `dle`
pub const DLE = 0x10;
/// DEPRECATED: use `dc1`
pub const DC1 = 0x11;
/// DEPRECATED: use `dc2`
pub const DC2 = 0x12;
/// DEPRECATED: use `dc3`
pub const DC3 = 0x13;
/// DEPRECATED: use `dc4`
pub const DC4 = 0x14;
/// DEPRECATED: use `nak`
pub const NAK = 0x15;
/// DEPRECATED: use `syn`
pub const SYN = 0x16;
/// DEPRECATED: use `etb`
pub const ETB = 0x17;
/// DEPRECATED: use `can`
pub const CAN = 0x18;
/// DEPRECATED: use `em`
pub const EM = 0x19;
/// DEPRECATED: use `sub`
pub const SUB = 0x1A;
/// DEPRECATED: use `esc`
pub const ESC = 0x1B;
/// DEPRECATED: use `fs`
pub const FS = 0x1C;
/// DEPRECATED: use `gs`
pub const GS = 0x1D;
/// DEPRECATED: use `rs`
pub const RS = 0x1E;
/// DEPRECATED: use `us`
pub const US = 0x1F;

/// DEPRECATED: use `del`
pub const DEL = 0x7F;

/// DEPRECATED: use `xon`
pub const XON = 0x11;
/// DEPRECATED: use `xoff`
pub const XOFF = 0x13;

/// Null.
pub const nul = 0x00;
/// Start of Heading.
pub const soh = 0x01;
/// Start of Text.
pub const stx = 0x02;
/// End of Text.
pub const etx = 0x03;
/// End of Transmission.
pub const eot = 0x04;
/// Enquiry.
pub const enq = 0x05;
/// Acknowledge.
pub const ack = 0x06;
/// Bell, Alert.
pub const bel = 0x07;
/// Backspace.
pub const bs = 0x08;
/// Horizontal Tab, Tab ('\t').
pub const ht = 0x09;
/// Line Feed, Newline ('\n').
pub const lf = 0x0A;
/// Vertical Tab.
pub const vt = 0x0B;
/// Form Feed.
pub const ff = 0x0C;
/// Carriage Return ('\r').
pub const cr = 0x0D;
/// Shift Out.
pub const so = 0x0E;
/// Shift In.
pub const si = 0x0F;
/// Data Link Escape.
pub const dle = 0x10;
/// Device Control One (XON).
pub const dc1 = 0x11;
/// Device Control Two.
pub const dc2 = 0x12;
/// Device Control Three (XOFF).
pub const dc3 = 0x13;
/// Device Control Four.
pub const dc4 = 0x14;
/// Negative Acknowledge.
pub const nak = 0x15;
/// Synchronous Idle.
pub const syn = 0x16;
/// End of Transmission Block.
pub const etb = 0x17;
/// Cancel.
pub const can = 0x18;
/// End of Medium.
pub const em = 0x19;
/// Substitute.
pub const sub = 0x1A;
/// Escape.
pub const esc = 0x1B;
/// File Separator.
pub const fs = 0x1C;
/// Group Separator.
pub const gs = 0x1D;
/// Record Separator.
pub const rs = 0x1E;
/// Unit Separator.
pub const us = 0x1F;

/// Delete.
pub const del = 0x7F;

/// An alias to `dc1`.
pub const xon = dc1;
/// An alias to `dc3`.
pub const xoff = dc3;
};

const tIndex = enum(u3) {
Expand Down Expand Up @@ -188,73 +298,106 @@ fn inTable(c: u8, t: tIndex) bool {
return (combinedTable[c] & (@as(u8, 1) << @enumToInt(t))) != 0;
}

pub fn isAlNum(c: u8) bool {
/// DEPRECATED: use `isAlphanumeric`.
pub const isAlNum = isAlphanumeric;
/// DEPRECATED: use `isAlphabetic`.
pub const isAlpha = isAlphabetic;
/// DEPRECATED: use `isControl`.
pub const isCntrl = isControl;
/// DEPRECATED: use `isWhitespace`.
pub const isSpace = isWhitespace;
/// DEPRECATED: use `whitespace`.
pub const spaces = whitespace;
/// DEPRECATED: use `isHex`.
pub const isXDigit = isHex;

/// Reports whether `c` is classified as a letter or as a digit.
/// Both classes are tested with a single lookup in the shared class table.
pub fn isAlphanumeric(c: u8) bool {
    const alpha_bit = @as(u8, 1) << @enumToInt(tIndex.Alpha);
    const digit_bit = @as(u8, 1) << @enumToInt(tIndex.Digit);
    const wanted = alpha_bit | digit_bit;
    return (combinedTable[c] & wanted) != 0;
}

pub fn isAlpha(c: u8) bool {
/// Reports whether `c` belongs to the alphabetic character class.
pub fn isAlphabetic(c: u8) bool {
    const class: tIndex = .Alpha;
    return inTable(c, class);
}

pub fn isCntrl(c: u8) bool {
return c < 0x20 or c == 127; //DEL
/// Returns whether the character is a control character, i.e. one of the
/// C0 control codes (`control_code.nul` through `control_code.us`) or `control_code.del`.
/// For 7-bit ASCII input this is the same as `!isPrint(c)`.
/// Bytes outside of the ASCII range are never reported as control characters.
///
/// See also: `control_code`.
pub fn isControl(c: u8) bool {
return c <= control_code.us or c == control_code.del;
}

/// Reports whether `c` belongs to the digit character class.
pub fn isDigit(c: u8) bool {
    const class: tIndex = .Digit;
    return inTable(c, class);
}

/// DEPRECATED: use `isPrint(c) and c != ' '` instead
///
/// Reports whether `c` belongs to the graphical character class.
pub fn isGraph(c: u8) bool {
    const class: tIndex = .Graph;
    return inTable(c, class);
}

/// Reports whether `c` belongs to the lowercase-letter character class.
pub fn isLower(c: u8) bool {
    const class: tIndex = .Lower;
    return inTable(c, class);
}

/// Returns whether the character has some graphical representation and can be printed.
/// This also returns `true` for the space character.
/// For 7-bit ASCII input this is the same as `!isControl(c)`.
/// NOTE(review): bytes outside of the ASCII range are not control characters,
/// and are presumably not printable either, so the equivalence above does not
/// extend to them. Confirm against the lookup table.
pub fn isPrint(c: u8) bool {
return inTable(c, tIndex.Graph) or c == ' ';
}

/// DEPRECATED: create your own function based on your needs and what you want to do.
///
/// Reports whether `c` belongs to the punctuation character class.
pub fn isPunct(c: u8) bool {
    const class: tIndex = .Punct;
    return inTable(c, class);
}

pub fn isSpace(c: u8) bool {
/// Reports whether `c` belongs to the whitespace character class.
///
/// See also: `whitespace`.
pub fn isWhitespace(c: u8) bool {
    const class: tIndex = .Space;
    return inTable(c, class);
}

/// All the values for which isSpace() returns true. This may be used with
/// e.g. std.mem.trim() to trim whiteSpace.
pub const spaces = [_]u8{ ' ', '\t', '\n', '\r', control_code.VT, control_code.FF };
/// The general-purpose set of whitespace bytes: space, horizontal tab,
/// line feed, carriage return, vertical tab and form feed.
/// Handy as the strip set for e.g. `std.mem.trim`.
///
/// See also: `isWhitespace`.
pub const whitespace = [_]u8{
    ' ',
    control_code.ht,
    control_code.lf,
    control_code.cr,
    control_code.vt,
    control_code.ff,
};

test "spaces" {
const testing = std.testing;
for (spaces) |space| try testing.expect(isSpace(space));
test "whitespace" {
for (whitespace) |char| try std.testing.expect(isWhitespace(char));

var i: u8 = 0;
while (isASCII(i)) : (i += 1) {
if (isSpace(i)) try testing.expect(std.mem.indexOfScalar(u8, &spaces, i) != null);
if (isWhitespace(i)) try std.testing.expect(std.mem.indexOfScalar(u8, &whitespace, i) != null);
}
}

/// Reports whether `c` belongs to the uppercase-letter character class.
pub fn isUpper(c: u8) bool {
    const class: tIndex = .Upper;
    return inTable(c, class);
}

pub fn isXDigit(c: u8) bool {
/// Reports whether `c` belongs to the hexadecimal-digit character class.
/// Letter digits are accepted in either case.
pub fn isHex(c: u8) bool {
    const class: tIndex = .Hex;
    return inTable(c, class);
}

/// Reports whether `c` fits in 7 bits, i.e. its most significant bit is clear.
pub fn isASCII(c: u8) bool {
    const high_bit: u8 = 0x80;
    return (c & high_bit) == 0;
}

/// DEPRECATED: use `c == ' ' or c == '\t'` or try `isWhitespace`
///
/// Reports whether `c` is a space or a horizontal tab.
pub fn isBlank(c: u8) bool {
    return switch (c) {
        ' ', '\t' => true,
        else => false,
    };
}

/// Uppercases the character and returns it as-is if it's already uppercased or not a letter.
pub fn toUpper(c: u8) u8 {
if (isLower(c)) {
return c & 0b11011111;
Expand All @@ -263,6 +406,7 @@ pub fn toUpper(c: u8) u8 {
}
}

/// Lowercases the character and returns it as-is if it's already lowercased or not a letter.
pub fn toLower(c: u8) u8 {
if (isUpper(c)) {
return c | 0b00100000;
Expand All @@ -274,13 +418,50 @@ pub fn toLower(c: u8) u8 {
// Exercises the character-classification and case-conversion functions
// through their current (non-deprecated) names.
test "ascii character classes" {
    const testing = std.testing;

    // Control characters: both branches of `isControl` (C0 range and DEL).
    try testing.expect(!isControl('a'));
    try testing.expect(!isControl('z'));
    try testing.expect(isControl(control_code.nul));
    try testing.expect(isControl(control_code.ff));
    try testing.expect(isControl(control_code.us));
    try testing.expect(isControl(control_code.del));

    // Case conversion leaves non-letters and non-ASCII bytes untouched.
    try testing.expect('C' == toUpper('c'));
    try testing.expect(':' == toUpper(':'));
    try testing.expect('\xab' == toUpper('\xab'));
    try testing.expect(!isUpper('z'));

    try testing.expect('c' == toLower('C'));
    try testing.expect(':' == toLower(':'));
    try testing.expect('\xab' == toLower('\xab'));
    try testing.expect(!isLower('Z'));

    try testing.expect(isAlphanumeric('Z'));
    try testing.expect(isAlphanumeric('z'));
    try testing.expect(isAlphanumeric('5'));
    try testing.expect(!isAlphanumeric('!'));

    try testing.expect(!isAlphabetic('5'));
    try testing.expect(isAlphabetic('c'));

    try testing.expect(isWhitespace(' '));
    try testing.expect(isWhitespace('\t'));
    try testing.expect(isWhitespace('\r'));
    try testing.expect(isWhitespace('\n'));
    try testing.expect(!isWhitespace('.'));

    try testing.expect(!isHex('g'));
    try testing.expect(isHex('b'));
    try testing.expect(isHex('9'));

    try testing.expect(!isDigit('~'));
    try testing.expect(isDigit('0'));
    try testing.expect(isDigit('9'));

    try testing.expect(isPrint(' '));
    try testing.expect(isPrint('@'));
    try testing.expect(isPrint('~'));
    try testing.expect(!isPrint(control_code.esc));
}

/// Writes a lower case copy of `ascii_string` to `output`.
Expand Down Expand Up @@ -341,7 +522,7 @@ test "allocUpperString" {
try std.testing.expectEqualStrings("ABCDEFGHIJKLMNOPQRST0234+💩!", result);
}

/// Compares strings `a` and `b` case insensitively and returns whether they are equal.
/// Compares strings `a` and `b` case-insensitively and returns whether they are equal.
pub fn eqlIgnoreCase(a: []const u8, b: []const u8) bool {
if (a.len != b.len) return false;
for (a) |a_c, i| {
Expand Down Expand Up @@ -397,11 +578,10 @@ test "indexOfIgnoreCase" {
try std.testing.expect(indexOfIgnoreCase("one two three FouR", "gOur") == null);
try std.testing.expect(indexOfIgnoreCase("foO", "Foo").? == 0);
try std.testing.expect(indexOfIgnoreCase("foo", "fool") == null);

try std.testing.expect(indexOfIgnoreCase("FOO foo", "fOo").? == 0);
}

/// Compares two slices of numbers lexicographically. O(n).
/// Returns the lexicographical order of two slices. O(n).
pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
const n = std.math.min(lhs.len, rhs.len);
var i: usize = 0;
Expand All @@ -415,8 +595,7 @@ pub fn orderIgnoreCase(lhs: []const u8, rhs: []const u8) std.math.Order {
return std.math.order(lhs.len, rhs.len);
}

/// Returns true if lhs < rhs, false otherwise
/// TODO rename "IgnoreCase" to "Insensitive" in this entire file.
/// Reports whether `lhs` sorts strictly before `rhs` according to `orderIgnoreCase`.
pub fn lessThanIgnoreCase(lhs: []const u8, rhs: []const u8) bool {
    return switch (orderIgnoreCase(lhs, rhs)) {
        .lt => true,
        .eq, .gt => false,
    };
}