diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 558ef63..0eba2f0 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -40,3 +40,10 @@ jobs: run: | cargo test --release --no-default-features --lib -- --include-ignored cargo test --doc --no-default-features + + - name: Build with check_suffix + run: cargo build --features=check_suffix + - name: Run tests with check_suffix + run: | + cargo test --release --features=check_suffix --lib -- --include-ignored + cargo test --doc --features=check_suffix diff --git a/Cargo.toml b/Cargo.toml index bbfd540..46d41b5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -26,6 +26,8 @@ exclude = [".github"] [features] default = ["proc-macro2"] +check_suffix = ["unicode-xid"] [dependencies] proc-macro2 = { version = "1", optional = true } +unicode-xid = { version = "0.2.4", optional = true } diff --git a/examples/procmacro/examples/main.rs b/examples/procmacro/examples/main.rs index 117d816..55ad095 100644 --- a/examples/procmacro/examples/main.rs +++ b/examples/procmacro/examples/main.rs @@ -1,4 +1,4 @@ -use procmacro_example::{concat, repeat}; +use procmacro_example::{concat, dbg_and_swallow, repeat}; const FOO: &str = concat!(r#"Hello "# '🦊' "\nHere is a friend: \u{1F427}"); // const FOO: &str = concat!(::); @@ -8,6 +8,7 @@ const BAR: &str = repeat!(3 * "నా పిల్లి లావుగా ఉ const BAZ: &str = repeat!(0b101 * "🦀"); // const BAZ: &str = repeat!(3.5 * "🦀"); +dbg_and_swallow!(16px); fn main() { println!("{}", FOO); diff --git a/examples/procmacro/src/lib.rs b/examples/procmacro/src/lib.rs index 86e167f..f6c43bc 100644 --- a/examples/procmacro/src/lib.rs +++ b/examples/procmacro/src/lib.rs @@ -3,6 +3,14 @@ use proc_macro::{Spacing, TokenStream, TokenTree}; use litrs::{Literal, IntegerLit, StringLit}; +#[proc_macro] +pub fn dbg_and_swallow(input: TokenStream) -> TokenStream { + for token in input { + println!("{} -> {:#?}", token, Literal::try_from(&token)); + } + TokenStream::new() +} + /// Concatinates 
all input string and char literals into a single output string /// literal. #[proc_macro] diff --git a/src/byte/mod.rs b/src/byte/mod.rs index 7c64901..ffdff5d 100644 --- a/src/byte/mod.rs +++ b/src/byte/mod.rs @@ -4,6 +4,7 @@ use crate::{ Buffer, ParseError, err::{perr, ParseErrorKind::*}, escape::unescape, + parse::check_suffix, }; @@ -15,6 +16,8 @@ use crate::{ #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct ByteLit { raw: B, + /// Start index of the suffix or `raw.len()` if there is no suffix. + start_suffix: usize, value: u8, } @@ -29,8 +32,8 @@ impl ByteLit { return Err(perr(None, InvalidByteLiteralStart)); } - let value = parse_impl(&input)?; - Ok(Self { raw: input, value }) + let (value, start_suffix) = parse_impl(&input)?; + Ok(Self { raw: input, value, start_suffix }) } /// Returns the byte value that this literal represents. @@ -38,6 +41,11 @@ impl ByteLit { self.value } + /// The optional suffix. Returns `""` if the suffix is empty/does not exist. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.start_suffix..] + } + /// Returns the raw input that was passed to `parse`. pub fn raw_input(&self) -> &str { &self.raw @@ -56,6 +64,7 @@ impl ByteLit<&str> { pub fn to_owned(&self) -> ByteLit { ByteLit { raw: self.raw.to_owned(), + start_suffix: self.start_suffix, value: self.value, } } @@ -69,32 +78,29 @@ impl fmt::Display for ByteLit { /// Precondition: must start with `b'`. 
#[inline(never)] -pub(crate) fn parse_impl(input: &str) -> Result { - if input.len() == 2 { - return Err(perr(None, UnterminatedByteLiteral)); - } - if *input.as_bytes().last().unwrap() != b'\'' { - return Err(perr(None, UnterminatedByteLiteral)); - } - - let inner = &input[2..input.len() - 1]; - let first = inner.as_bytes().get(0).ok_or(perr(None, EmptyByteLiteral))?; +pub(crate) fn parse_impl(input: &str) -> Result<(u8, usize), ParseError> { + let input_bytes = input.as_bytes(); + let first = input_bytes.get(2).ok_or(perr(None, UnterminatedByteLiteral))?; let (c, len) = match first { - b'\'' => return Err(perr(2, UnescapedSingleQuote)), - b'\n' | b'\t' | b'\r' - => return Err(perr(2, UnescapedSpecialWhitespace)), - - b'\\' => unescape::(inner, 2)?, + b'\'' if input_bytes.get(3) == Some(&b'\'') => return Err(perr(2, UnescapedSingleQuote)), + b'\'' => return Err(perr(None, EmptyByteLiteral)), + b'\n' | b'\t' | b'\r' => return Err(perr(2, UnescapedSpecialWhitespace)), + b'\\' => unescape::(&input[2..], 2)?, other if other.is_ascii() => (*other, 1), _ => return Err(perr(2, NonAsciiInByteLiteral)), }; - let rest = &inner[len..]; - if !rest.is_empty() { - return Err(perr(len + 2..input.len() - 1, OverlongByteLiteral)); + match input[2 + len..].find('\'') { + Some(0) => {} + Some(_) => return Err(perr(None, OverlongByteLiteral)), + None => return Err(perr(None, UnterminatedByteLiteral)), } - Ok(c) + let start_suffix = 2 + len + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; + + Ok((c, start_suffix)) } #[cfg(test)] diff --git a/src/byte/tests.rs b/src/byte/tests.rs index 08586b0..3cf16b5 100644 --- a/src/byte/tests.rs +++ b/src/byte/tests.rs @@ -3,16 +3,20 @@ use crate::{ByteLit, Literal, test_util::{assert_parse_ok_eq, assert_roundtrip}} // ===== Utility functions ======================================================================= macro_rules! 
check { - ($lit:literal) => { - let input = stringify!($lit); + ($lit:literal) => { check!($lit, stringify!($lit), "") }; + ($lit:literal, $input:expr, $suffix:literal) => { + let input = $input; let expected = ByteLit { raw: input, + start_suffix: input.len() - $suffix.len(), value: $lit, }; assert_parse_ok_eq(input, ByteLit::parse(input), expected.clone(), "ByteLit::parse"); assert_parse_ok_eq(input, Literal::parse(input), Literal::Byte(expected), "Literal::parse"); - assert_eq!(ByteLit::parse(input).unwrap().value(), $lit); + let lit = ByteLit::parse(input).unwrap(); + assert_eq!(lit.value(), $lit); + assert_eq!(lit.suffix(), $suffix); assert_roundtrip(expected.to_owned(), input); }; } @@ -113,13 +117,23 @@ fn byte_escapes() { check!(b'\xFF'); } +#[test] +fn suffixes() { + check!(b'a', r##"b'a'peter"##, "peter"); + check!(b'#', r##"b'#'peter"##, "peter"); + check!(b'\n', r##"b'\n'peter"##, "peter"); + check!(b'\'', r##"b'\''peter"##, "peter"); + check!(b'\"', r##"b'\"'peter"##, "peter"); + check!(b'\xFF', r##"b'\xFF'peter"##, "peter"); +} + #[test] fn invald_escapes() { assert_err!(ByteLit, r"b'\a'", UnknownEscape, 2..4); assert_err!(ByteLit, r"b'\y'", UnknownEscape, 2..4); - assert_err!(ByteLit, r"b'\", UnterminatedByteLiteral, None); - assert_err!(ByteLit, r"b'\x'", UnterminatedEscape, 2..4); - assert_err!(ByteLit, r"b'\x1'", UnterminatedEscape, 2..5); + assert_err!(ByteLit, r"b'\", UnterminatedEscape, 2..3); + assert_err!(ByteLit, r"b'\x'", UnterminatedEscape, 2..5); + assert_err!(ByteLit, r"b'\x1'", InvalidXEscape, 2..6); assert_err!(ByteLit, r"b'\xaj'", InvalidXEscape, 2..6); assert_err!(ByteLit, r"b'\xjb'", InvalidXEscape, 2..6); } @@ -148,16 +162,16 @@ fn unicode_escape_not_allowed() { #[test] fn parse_err() { assert_err!(ByteLit, r"b''", EmptyByteLiteral, None); - assert_err!(ByteLit, r"b' ''", OverlongByteLiteral, 3..4); + assert_err!(ByteLit, r"b' ''", UnexpectedChar, 4..5); assert_err!(ByteLit, r"b'", UnterminatedByteLiteral, None); 
assert_err!(ByteLit, r"b'a", UnterminatedByteLiteral, None); assert_err!(ByteLit, r"b'\n", UnterminatedByteLiteral, None); assert_err!(ByteLit, r"b'\x35", UnterminatedByteLiteral, None); - assert_err!(ByteLit, r"b'ab'", OverlongByteLiteral, 3..4); - assert_err!(ByteLit, r"b'a _'", OverlongByteLiteral, 3..5); - assert_err!(ByteLit, r"b'\n3'", OverlongByteLiteral, 4..5); + assert_err!(ByteLit, r"b'ab'", OverlongByteLiteral, None); + assert_err!(ByteLit, r"b'a _'", OverlongByteLiteral, None); + assert_err!(ByteLit, r"b'\n3'", OverlongByteLiteral, None); assert_err!(ByteLit, r"", Empty, None); diff --git a/src/bytestr/mod.rs b/src/bytestr/mod.rs index a2908b9..a0e0972 100644 --- a/src/bytestr/mod.rs +++ b/src/bytestr/mod.rs @@ -24,6 +24,9 @@ pub struct ByteStringLit { /// The number of hash signs in case of a raw string literal, or `None` if /// it's not a raw string literal. num_hashes: Option, + + /// Start index of the suffix or `raw.len()` if there is no suffix. + start_suffix: usize, } impl ByteStringLit { @@ -37,7 +40,8 @@ impl ByteStringLit { return Err(perr(None, InvalidByteStringLiteralStart)); } - Self::parse_impl(input) + let (value, num_hashes, start_suffix) = parse_impl(&input)?; + Ok(Self { raw: input, value, num_hashes, start_suffix }) } /// Returns the string value this literal represents (where all escapes have @@ -56,6 +60,11 @@ impl ByteStringLit { value.map(B::ByteCow::from).unwrap_or_else(|| raw.cut(inner_range).into_byte_cow()) } + /// The optional suffix. Returns `""` if the suffix is empty/does not exist. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.start_suffix..] + } + /// Returns whether this literal is a raw string literal (starting with /// `r`). pub fn is_raw_byte_string(&self) -> bool { @@ -75,27 +84,8 @@ impl ByteStringLit { /// The range within `self.raw` that excludes the quotes and potential `r#`. 
fn inner_range(&self) -> Range { match self.num_hashes { - None => 2..self.raw.len() - 1, - Some(n) => 2 + n as usize + 1..self.raw.len() - n as usize - 1, - } - } - - /// Precondition: input has to start with either `b"` or `br`. - pub(crate) fn parse_impl(input: B) -> Result { - if input.starts_with(r"br") { - let (value, num_hashes) = scan_raw_string::(&input, 2)?; - Ok(Self { - raw: input, - value: value.map(|s| s.into_bytes()), - num_hashes: Some(num_hashes), - }) - } else { - let value = unescape_string::(&input, 2)?.map(|s| s.into_bytes()); - Ok(Self { - raw: input, - value, - num_hashes: None, - }) + None => 2..self.start_suffix - 1, + Some(n) => 2 + n as usize + 1..self.start_suffix - n as usize - 1, } } } @@ -108,6 +98,7 @@ impl ByteStringLit<&str> { raw: self.raw.to_owned(), value: self.value, num_hashes: self.num_hashes, + start_suffix: self.start_suffix, } } } @@ -119,5 +110,17 @@ impl fmt::Display for ByteStringLit { } +/// Precondition: input has to start with either `b"` or `br`. +#[inline(never)] +fn parse_impl(input: &str) -> Result<(Option>, Option, usize), ParseError> { + if input.starts_with("br") { + scan_raw_string::(&input, 2) + .map(|(v, num, start_suffix)| (v.map(String::into_bytes), Some(num), start_suffix)) + } else { + unescape_string::(&input, 2) + .map(|(v, start_suffix)| (v.map(String::into_bytes), None, start_suffix)) + } +} + #[cfg(test)] mod tests; diff --git a/src/bytestr/tests.rs b/src/bytestr/tests.rs index b0480fd..2afef5a 100644 --- a/src/bytestr/tests.rs +++ b/src/bytestr/tests.rs @@ -4,19 +4,25 @@ use crate::{Literal, ByteStringLit, test_util::{assert_parse_ok_eq, assert_round macro_rules! 
check { ($lit:literal, $has_escapes:expr, $num_hashes:expr) => { - let input = stringify!($lit); + check!($lit, stringify!($lit), $has_escapes, $num_hashes, "") + }; + ($lit:literal, $input:expr, $has_escapes:expr, $num_hashes:expr, $suffix:literal) => { + let input = $input; let expected = ByteStringLit { raw: input, value: if $has_escapes { Some($lit.to_vec()) } else { None }, num_hashes: $num_hashes, + start_suffix: input.len() - $suffix.len(), }; assert_parse_ok_eq( input, ByteStringLit::parse(input), expected.clone(), "ByteStringLit::parse"); assert_parse_ok_eq( input, Literal::parse(input), Literal::ByteString(expected.clone()), "Literal::parse"); - assert_eq!(ByteStringLit::parse(input).unwrap().value(), $lit); - assert_eq!(ByteStringLit::parse(input).unwrap().into_value().as_ref(), $lit); + let lit = ByteStringLit::parse(input).unwrap(); + assert_eq!(lit.value(), $lit); + assert_eq!(lit.suffix(), $suffix); + assert_eq!(lit.into_value().as_ref(), $lit); assert_roundtrip(expected.into_owned(), input); }; } @@ -43,6 +49,7 @@ fn special_whitespace() { raw: &*input, value: None, num_hashes, + start_suffix: input.len(), }; assert_parse_ok_eq( &input, ByteStringLit::parse(&*input), expected.clone(), "ByteStringLit::parse"); @@ -147,6 +154,14 @@ fn raw_byte_string() { check!(br#"cat\n\t\r\0\\x60\u{123}doggo"#, false, Some(1)); } +#[test] +fn suffixes() { + check!(b"hello", r###"b"hello"suffix"###, false, None, "suffix"); + check!(b"fox", r#"b"fox"peter"#, false, None, "peter"); + check!(b"a\x0cb\\", r#"b"a\x0cb\\"_jürgen"#, true, None, "_jürgen"); + check!(br"a\x0cb\\", r###"br#"a\x0cb\\"#_jürgen"###, false, Some(1), "_jürgen"); +} + #[test] fn parse_err() { assert_err!(ByteStringLit, r#"b""#, UnterminatedString, None); @@ -154,10 +169,8 @@ fn parse_err() { assert_err!(ByteStringLit, r#"b"Jurgen"#, UnterminatedString, None); assert_err!(ByteStringLit, r#"b"foo bar baz"#, UnterminatedString, None); - assert_err!(ByteStringLit, r#"b"fox"peter"#, UnexpectedChar, 
6..11); - assert_err!(ByteStringLit, r#"b"fox"peter""#, UnexpectedChar, 6..12); - assert_err!(ByteStringLit, r#"b"fox"bar"#, UnexpectedChar, 6..9); - assert_err!(ByteStringLit, r###"br#"foo "# bar"#"###, UnexpectedChar, 10..16); + assert_err!(ByteStringLit, r#"b"fox"peter""#, InvalidSuffix, 6); + assert_err!(ByteStringLit, r###"br#"foo "# bar"#"###, UnexpectedChar, 10); assert_err!(ByteStringLit, "b\"\r\"", IsolatedCr, 2); assert_err!(ByteStringLit, "b\"fo\rx\"", IsolatedCr, 4); @@ -179,10 +192,10 @@ fn non_ascii() { } #[test] -fn invald_escapes() { +fn invalid_escapes() { assert_err!(ByteStringLit, r#"b"\a""#, UnknownEscape, 2..4); assert_err!(ByteStringLit, r#"b"foo\y""#, UnknownEscape, 5..7); - assert_err!(ByteStringLit, r#"b"\"#, UnterminatedString, None); + assert_err!(ByteStringLit, r#"b"\"#, UnterminatedEscape, 2); assert_err!(ByteStringLit, r#"b"\x""#, UnterminatedEscape, 2..4); assert_err!(ByteStringLit, r#"b"foo\x1""#, UnterminatedEscape, 5..8); assert_err!(ByteStringLit, r#"b" \xaj""#, InvalidXEscape, 3..7); diff --git a/src/char/mod.rs b/src/char/mod.rs index 96d5037..54f6f11 100644 --- a/src/char/mod.rs +++ b/src/char/mod.rs @@ -4,7 +4,7 @@ use crate::{ Buffer, ParseError, err::{perr, ParseErrorKind::*}, escape::unescape, - parse::first_byte_or_empty, + parse::{first_byte_or_empty, check_suffix}, }; @@ -16,6 +16,8 @@ use crate::{ #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct CharLit { raw: B, + /// Start index of the suffix or `raw.len()` if there is no suffix. + start_suffix: usize, value: char, } @@ -25,8 +27,8 @@ impl CharLit { pub fn parse(input: B) -> Result { match first_byte_or_empty(&input)? { b'\'' => { - let value = parse_impl(&input)?; - Ok(Self { raw: input, value }) + let (value, start_suffix) = parse_impl(&input)?; + Ok(Self { raw: input, value, start_suffix }) }, _ => Err(perr(0, DoesNotStartWithQuote)), } @@ -37,6 +39,11 @@ impl CharLit { self.value } + /// The optional suffix. 
Returns `""` if the suffix is empty/does not exist. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.start_suffix..] + } + /// Returns the raw input that was passed to `parse`. pub fn raw_input(&self) -> &str { &self.raw @@ -55,6 +62,7 @@ impl CharLit<&str> { pub fn to_owned(&self) -> CharLit { CharLit { raw: self.raw.to_owned(), + start_suffix: self.start_suffix, value: self.value, } } @@ -68,31 +76,29 @@ impl fmt::Display for CharLit { /// Precondition: first character in input must be `'`. #[inline(never)] -pub(crate) fn parse_impl(input: &str) -> Result { - if input.len() == 1 { - return Err(perr(None, UnterminatedCharLiteral)); - } - if *input.as_bytes().last().unwrap() != b'\'' { - return Err(perr(None, UnterminatedCharLiteral)); - } - - let inner = &input[1..input.len() - 1]; - let first = inner.chars().nth(0).ok_or(perr(None, EmptyCharLiteral))?; +pub(crate) fn parse_impl(input: &str) -> Result<(char, usize), ParseError> { + let first = input.chars().nth(1).ok_or(perr(None, UnterminatedCharLiteral))?; let (c, len) = match first { - '\'' => return Err(perr(1, UnescapedSingleQuote)), + '\'' if input.chars().nth(2) == Some('\'') => return Err(perr(1, UnescapedSingleQuote)), + '\'' => return Err(perr(None, EmptyCharLiteral)), '\n' | '\t' | '\r' => return Err(perr(1, UnescapedSpecialWhitespace)), - '\\' => unescape::(inner, 1)?, + '\\' => unescape::(&input[1..], 1)?, other => (other, other.len_utf8()), }; - let rest = &inner[len..]; - if !rest.is_empty() { - return Err(perr(len + 1..input.len() - 1, OverlongCharLiteral)); + match input[1 + len..].find('\'') { + Some(0) => {} + Some(_) => return Err(perr(None, OverlongCharLiteral)), + None => return Err(perr(None, UnterminatedCharLiteral)), } - Ok(c) + let start_suffix = 1 + len + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; + + Ok((c, start_suffix)) } #[cfg(test)] diff --git a/src/char/tests.rs b/src/char/tests.rs index bfae5e4..19219db 100644 
--- a/src/char/tests.rs +++ b/src/char/tests.rs @@ -4,16 +4,20 @@ use super::CharLit; // ===== Utility functions ======================================================================= macro_rules! check { - ($lit:literal) => { - let input = stringify!($lit); + ($lit:literal) => { check!($lit, stringify!($lit), "") }; + ($lit:literal, $input:expr, $suffix:literal) => { + let input = $input; let expected = CharLit { raw: input, + start_suffix: input.len() - $suffix.len(), value: $lit, }; assert_parse_ok_eq(input, CharLit::parse(input), expected.clone(), "CharLit::parse"); assert_parse_ok_eq(input, Literal::parse(input), Literal::Char(expected), "Literal::parse"); - assert_eq!(CharLit::parse(input).unwrap().value(), $lit); + let lit = CharLit::parse(input).unwrap(); + assert_eq!(lit.value(), $lit); + assert_eq!(lit.suffix(), $suffix); assert_roundtrip(expected.to_owned(), input); }; } @@ -134,6 +138,15 @@ fn unicode_escapes() { check!('\u{1_F6_02_____}'); } +#[test] +fn suffixes() { + check!('a', r##"'a'peter"##, "peter"); + check!('#', r##"'#'peter"##, "peter"); + check!('\n', r##"'\n'peter"##, "peter"); + check!('\'', r##"'\''peter"##, "peter"); + check!('\"', r##"'\"'peter"##, "peter"); +} + #[test] fn invald_ascii_escapes() { assert_err!(CharLit, r"'\x80'", NonAsciiXEscape, 1..5); @@ -151,12 +164,12 @@ fn invald_ascii_escapes() { } #[test] -fn invald_escapes() { +fn invalid_escapes() { assert_err!(CharLit, r"'\a'", UnknownEscape, 1..3); assert_err!(CharLit, r"'\y'", UnknownEscape, 1..3); - assert_err!(CharLit, r"'\", UnterminatedCharLiteral, None); - assert_err!(CharLit, r"'\x'", UnterminatedEscape, 1..3); - assert_err!(CharLit, r"'\x1'", UnterminatedEscape, 1..4); + assert_err!(CharLit, r"'\", UnterminatedEscape, 1); + assert_err!(CharLit, r"'\x'", UnterminatedEscape, 1..4); + assert_err!(CharLit, r"'\x1'", InvalidXEscape, 1..5); assert_err!(CharLit, r"'\xaj'", InvalidXEscape, 1..5); assert_err!(CharLit, r"'\xjb'", InvalidXEscape, 1..5); } @@ -167,10 +180,10 @@ 
fn invalid_unicode_escapes() { assert_err!(CharLit, r"'\u '", UnicodeEscapeWithoutBrace, 1..3); assert_err!(CharLit, r"'\u3'", UnicodeEscapeWithoutBrace, 1..3); - assert_err!(CharLit, r"'\u{'", UnterminatedUnicodeEscape, 1..4); - assert_err!(CharLit, r"'\u{12'", UnterminatedUnicodeEscape, 1..6); - assert_err!(CharLit, r"'\u{a0b'", UnterminatedUnicodeEscape, 1..7); - assert_err!(CharLit, r"'\u{a0_b '", UnterminatedUnicodeEscape, 1..10); + assert_err!(CharLit, r"'\u{'", UnterminatedUnicodeEscape, 1..5); + assert_err!(CharLit, r"'\u{12'", UnterminatedUnicodeEscape, 1..7); + assert_err!(CharLit, r"'\u{a0b'", UnterminatedUnicodeEscape, 1..8); + assert_err!(CharLit, r"'\u{a0_b '", UnterminatedUnicodeEscape, 1..11); assert_err!(CharLit, r"'\u{_}'", InvalidStartOfUnicodeEscape, 4); assert_err!(CharLit, r"'\u{_5f}'", InvalidStartOfUnicodeEscape, 4); @@ -192,16 +205,16 @@ fn invalid_unicode_escapes() { #[test] fn parse_err() { assert_err!(CharLit, r"''", EmptyCharLiteral, None); - assert_err!(CharLit, r"' ''", OverlongCharLiteral, 2..3); + assert_err!(CharLit, r"' ''", UnexpectedChar, 3); assert_err!(CharLit, r"'", UnterminatedCharLiteral, None); assert_err!(CharLit, r"'a", UnterminatedCharLiteral, None); assert_err!(CharLit, r"'\n", UnterminatedCharLiteral, None); assert_err!(CharLit, r"'\x35", UnterminatedCharLiteral, None); - assert_err!(CharLit, r"'ab'", OverlongCharLiteral, 2..3); - assert_err!(CharLit, r"'a _'", OverlongCharLiteral, 2..4); - assert_err!(CharLit, r"'\n3'", OverlongCharLiteral, 3..4); + assert_err!(CharLit, r"'ab'", OverlongCharLiteral, None); + assert_err!(CharLit, r"'a _'", OverlongCharLiteral, None); + assert_err!(CharLit, r"'\n3'", OverlongCharLiteral, None); assert_err!(CharLit, r"", Empty, None); diff --git a/src/err.rs b/src/err.rs index 1011550..86d51dc 100644 --- a/src/err.rs +++ b/src/err.rs @@ -221,13 +221,6 @@ pub(crate) enum ParseErrorKind { /// Integer literal does not contain any valid digits. 
NoDigits, - /// Found a integer type suffix that is invalid. - InvalidIntegerTypeSuffix, - - /// Found a float type suffix that is invalid. Only `f32` and `f64` are - /// valid. - InvalidFloatTypeSuffix, - /// Exponent of a float literal does not contain any digits. NoExponentDigits, @@ -309,6 +302,17 @@ pub(crate) enum ParseErrorKind { /// An literal `\r` character not followed by a `\n` character in a /// (raw) string or byte string literal. IsolatedCr, + + /// Literal suffix is not a valid identifier. + InvalidSuffix, + + /// Returned by `FloatLit::parse` if an integer literal (no fractional nor + /// exponent part) is passed. + UnexpectedIntegerLit, + + /// Integer suffixes cannot start with `e` or `E` as this conflicts with the + /// grammar for float literals. + IntegerSuffixStartingWithE, } impl std::error::Error for ParseError {} @@ -324,8 +328,6 @@ impl fmt::Display for ParseError { DoesNotStartWithDigit => "number literal does not start with decimal digit", InvalidDigit => "integer literal contains a digit invalid for its base", NoDigits => "integer literal does not contain any digits", - InvalidIntegerTypeSuffix => "invalid integer type suffix", - InvalidFloatTypeSuffix => "invalid floating point type suffix", NoExponentDigits => "exponent of floating point literal does not contain any digits", UnknownEscape => "unknown escape", UnterminatedEscape => "unterminated escape: input ended too soon", @@ -354,6 +356,9 @@ impl fmt::Display for ParseError { InvalidByteLiteralStart => "invalid start for byte literal", InvalidByteStringLiteralStart => "invalid start for byte string literal", IsolatedCr => r"`\r` not immediately followed by `\n` in string", + InvalidSuffix => "literal suffix is not a valid identifier", + UnexpectedIntegerLit => "expected float literal, but found integer", + IntegerSuffixStartingWithE => "integer literal suffix must not start with 'e' or 'E'", }; description.fmt(f)?; diff --git a/src/escape.rs b/src/escape.rs index 19b63a1..5eb8382 
100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -1,4 +1,4 @@ -use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::hex_digit_value}; +use crate::{ParseError, err::{perr, ParseErrorKind::*}, parse::{hex_digit_value, check_suffix}}; /// Must start with `\` @@ -117,14 +117,15 @@ fn is_string_continue_skipable_whitespace(b: u8) -> bool { pub(crate) fn unescape_string( input: &str, offset: usize, -) -> Result, ParseError> { +) -> Result<(Option, usize), ParseError> { + let mut closing_quote_pos = None; let mut i = offset; let mut end_last_escape = offset; let mut value = String::new(); - while i < input.len() - 1 { + while i < input.len() { match input.as_bytes()[i] { // Handle "string continue". - b'\\' if input.as_bytes()[i + 1] == b'\n' => { + b'\\' if input.as_bytes().get(i + 1) == Some(&b'\n') => { value.push_str(&input[end_last_escape..i]); // Find the first non-whitespace character. @@ -143,7 +144,7 @@ pub(crate) fn unescape_string( end_last_escape = i; } b'\r' => { - if input.as_bytes()[i + 1] == b'\n' { + if input.as_bytes().get(i + 1) == Some(&b'\n') { value.push_str(&input[end_last_escape..i]); value.push('\n'); i += 2; @@ -152,16 +153,21 @@ pub(crate) fn unescape_string( return Err(perr(i, IsolatedCr)) } } - b'"' => return Err(perr(i + 1..input.len(), UnexpectedChar)), + b'"' => { + closing_quote_pos = Some(i); + break; + }, b if !E::SUPPORTS_UNICODE && !b.is_ascii() => return Err(perr(i, NonAsciiInByteLiteral)), _ => i += 1, } } - if input.as_bytes()[input.len() - 1] != b'"' || input.len() == offset { - return Err(perr(None, UnterminatedString)); - } + let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedString))?; + + let start_suffix = closing_quote_pos + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; // `value` is only empty if there was no escape in the input string // (with the special case of the input being empty). 
This means the @@ -171,11 +177,11 @@ pub(crate) fn unescape_string( } else { // There was an escape in the string, so we need to push the // remaining unescaped part of the string still. - value.push_str(&input[end_last_escape..input.len() - 1]); + value.push_str(&input[end_last_escape..closing_quote_pos]); Some(value) }; - Ok(value) + Ok((value, start_suffix)) } /// Reads and checks a raw (byte) string literal, converting `\r\n` sequences to @@ -185,7 +191,7 @@ pub(crate) fn unescape_string( pub(crate) fn scan_raw_string( input: &str, offset: usize, -) -> Result<(Option, u32), ParseError> { +) -> Result<(Option, u32, usize), ParseError> { // Raw string literal let num_hashes = input[offset..].bytes().position(|b| b != b'#') .ok_or(perr(None, InvalidLiteral))?; @@ -234,12 +240,11 @@ pub(crate) fn scan_raw_string( i += 1; } - let closing_quote_pos = closing_quote_pos - .ok_or(perr(None, UnterminatedRawString))?; + let closing_quote_pos = closing_quote_pos.ok_or(perr(None, UnterminatedRawString))?; - if closing_quote_pos + num_hashes != input.len() - 1 { - return Err(perr(closing_quote_pos + num_hashes + 1..input.len(), UnexpectedChar)); - } + let start_suffix = closing_quote_pos + num_hashes + 1; + let suffix = &input[start_suffix..]; + check_suffix(suffix).map_err(|kind| perr(start_suffix, kind))?; // `value` is only empty if there was no \r\n in the input string (with the // special case of the input being empty). 
This means the string value @@ -253,5 +258,5 @@ pub(crate) fn scan_raw_string( Some(value) }; - Ok((value, num_hashes as u32)) + Ok((value, num_hashes as u32, start_suffix)) } diff --git a/src/float/mod.rs b/src/float/mod.rs index b196845..08a9e8a 100644 --- a/src/float/mod.rs +++ b/src/float/mod.rs @@ -1,9 +1,9 @@ -use std::fmt; +use std::{fmt, str::FromStr}; use crate::{ Buffer, ParseError, err::{perr, ParseErrorKind::*}, - parse::{end_dec_digits, first_byte_or_empty}, + parse::{end_dec_digits, first_byte_or_empty, check_suffix}, }; @@ -52,21 +52,13 @@ pub struct FloatLit { /// The first index after the whole number part (everything except type suffix). end_number_part: usize, - - /// Optional type suffix. - type_suffix: Option, -} - -/// All possible float type suffixes. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum FloatType { - F32, - F64, } impl FloatLit { /// Parses the input as a floating point literal. Returns an error if the - /// input is invalid or represents a different kind of literal. + /// input is invalid or represents a different kind of literal. Will also + /// reject decimal integer literals like `23` or `17f32`, in accordance + /// with the spec. pub fn parse(s: B) -> Result { match first_byte_or_empty(&s)? { b'0'..=b'9' => { @@ -75,26 +67,19 @@ impl FloatLit { end_integer_part, end_fractional_part, end_number_part, - type_suffix, .. } = parse_impl(&s)?; - Ok(Self { - raw: s, - end_integer_part, - end_fractional_part, - end_number_part, - type_suffix, - }) + Ok(Self { raw: s, end_integer_part, end_fractional_part, end_number_part }) }, _ => Err(perr(0, DoesNotStartWithDigit)), } } - /// Returns the whole number part (including integer part, fractional part - /// and exponent), but without the type suffix. If you want an actual - /// floating point value, you need to parse this string, e.g. with - /// `f32::from_str` or an external crate. 
+ /// Returns the number part (including integer part, fractional part and + /// exponent), but without the suffix. If you want an actual floating + /// point value, you need to parse this string, e.g. with `f32::from_str` + /// or an external crate. pub fn number_part(&self) -> &str { &(*self.raw)[..self.end_number_part] } @@ -121,9 +106,9 @@ impl FloatLit { &(*self.raw)[self.end_fractional_part..self.end_number_part] } - /// The optional type suffix. - pub fn type_suffix(&self) -> Option { - self.type_suffix + /// The optional suffix. Returns `""` if the suffix is empty/does not exist. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.end_number_part..] } /// Returns the raw input that was passed to `parse`. @@ -146,7 +131,6 @@ impl FloatLit<&str> { end_integer_part: self.end_integer_part, end_fractional_part: self.end_fractional_part, end_number_part: self.end_number_part, - type_suffix: self.type_suffix, } } } @@ -184,7 +168,6 @@ pub(crate) fn parse_impl(input: &str) -> Result, ParseError> { return Err(perr(end_integer_part + 1, UnexpectedChar)); } - // Optional exponent. let end_number_part = if rest.starts_with('e') || rest.starts_with('E') { // Strip single - or + sign at the beginning. @@ -207,23 +190,66 @@ pub(crate) fn parse_impl(input: &str) -> Result, ParseError> { end_fractional_part }; + // Make sure the suffix is valid. + let suffix = &input[end_number_part..]; + check_suffix(suffix).map_err(|kind| perr(end_number_part..input.len(), kind))?; - // Type suffix - let type_suffix = match &input[end_number_part..] { - "" => None, - "f32" => Some(FloatType::F32), - "f64" => Some(FloatType::F64), - _ => return Err(perr(end_number_part..input.len(), InvalidFloatTypeSuffix)), - }; + // A float literal needs either a fractional or exponent part, otherwise it's + // an integer literal. 
+ if end_integer_part == end_number_part { + return Err(perr(None, UnexpectedIntegerLit)); + } Ok(FloatLit { raw: input, end_integer_part, end_fractional_part, end_number_part, - type_suffix, }) } + +/// All possible float type suffixes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum FloatType { + F32, + F64, +} + +impl FloatType { + /// Returns the type corresponding to the given suffix (e.g. `"f32"` is + /// mapped to `Self::F32`). If the suffix is not a valid float type, `None` + /// is returned. + pub fn from_suffix(suffix: &str) -> Option { + match suffix { + "f32" => Some(FloatType::F32), + "f64" => Some(FloatType::F64), + _ => None, + } + } + + /// Returns the suffix for this type, e.g. `"f32"` for `Self::F32`. + pub fn suffix(self) -> &'static str { + match self { + Self::F32 => "f32", + Self::F64 => "f64", + } + } +} + +impl FromStr for FloatType { + type Err = (); + fn from_str(s: &str) -> Result { + Self::from_suffix(s).ok_or(()) + } +} + +impl fmt::Display for FloatType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.suffix().fmt(f) + } +} + + #[cfg(test)] mod tests; diff --git a/src/float/tests.rs b/src/float/tests.rs index f15af05..f22443b 100644 --- a/src/float/tests.rs +++ b/src/float/tests.rs @@ -20,18 +20,18 @@ macro_rules! 
check { end_integer_part: $intpart.len(), end_fractional_part: $intpart.len() + $fracpart.len(), end_number_part: $intpart.len() + $fracpart.len() + $exppart.len(), - type_suffix: check!(@ty $suffix), }; assert_parse_ok_eq( input, FloatLit::parse(input), expected_float.clone(), "FloatLit::parse"); assert_parse_ok_eq( input, Literal::parse(input), Literal::Float(expected_float), "Literal::parse"); + assert_eq!(FloatLit::parse(input).unwrap().suffix(), check!(@ty $suffix)); assert_roundtrip(expected_float.to_owned(), input); }; - (@ty f32) => { Some(FloatType::F32) }; - (@ty f64) => { Some(FloatType::F64) }; - (@ty -) => { None }; + (@ty f32) => { "f32" }; + (@ty f64) => { "f64" }; + (@ty -) => { "" }; (@stringify_suffix -) => { "" }; (@stringify_suffix $suffix:ident) => { stringify!($suffix) }; } @@ -46,42 +46,42 @@ fn manual_without_suffix() -> Result<(), ParseError> { assert_eq!(f.integer_part(), "3"); assert_eq!(f.fractional_part(), Some("14")); assert_eq!(f.exponent_part(), ""); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); let f = FloatLit::parse("9.")?; assert_eq!(f.number_part(), "9."); assert_eq!(f.integer_part(), "9"); assert_eq!(f.fractional_part(), Some("")); assert_eq!(f.exponent_part(), ""); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); let f = FloatLit::parse("8e1")?; assert_eq!(f.number_part(), "8e1"); assert_eq!(f.integer_part(), "8"); assert_eq!(f.fractional_part(), None); assert_eq!(f.exponent_part(), "e1"); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); let f = FloatLit::parse("8E3")?; assert_eq!(f.number_part(), "8E3"); assert_eq!(f.integer_part(), "8"); assert_eq!(f.fractional_part(), None); assert_eq!(f.exponent_part(), "E3"); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); let f = FloatLit::parse("8_7_6.1_23e15")?; assert_eq!(f.number_part(), "8_7_6.1_23e15"); assert_eq!(f.integer_part(), "8_7_6"); assert_eq!(f.fractional_part(), Some("1_23")); 
assert_eq!(f.exponent_part(), "e15"); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); let f = FloatLit::parse("8.2e-_04_9")?; assert_eq!(f.number_part(), "8.2e-_04_9"); assert_eq!(f.integer_part(), "8"); assert_eq!(f.fractional_part(), Some("2")); assert_eq!(f.exponent_part(), "e-_04_9"); - assert_eq!(f.type_suffix(), None); + assert_eq!(f.suffix(), ""); Ok(()) } @@ -93,28 +93,28 @@ fn manual_with_suffix() -> Result<(), ParseError> { assert_eq!(f.integer_part(), "3"); assert_eq!(f.fractional_part(), Some("14")); assert_eq!(f.exponent_part(), ""); - assert_eq!(f.type_suffix(), Some(FloatType::F32)); + assert_eq!(FloatType::from_suffix(f.suffix()), Some(FloatType::F32)); let f = FloatLit::parse("8e1f64")?; assert_eq!(f.number_part(), "8e1"); assert_eq!(f.integer_part(), "8"); assert_eq!(f.fractional_part(), None); assert_eq!(f.exponent_part(), "e1"); - assert_eq!(f.type_suffix(), Some(FloatType::F64)); + assert_eq!(FloatType::from_suffix(f.suffix()), Some(FloatType::F64)); let f = FloatLit::parse("8_7_6.1_23e15f32")?; assert_eq!(f.number_part(), "8_7_6.1_23e15"); assert_eq!(f.integer_part(), "8_7_6"); assert_eq!(f.fractional_part(), Some("1_23")); assert_eq!(f.exponent_part(), "e15"); - assert_eq!(f.type_suffix(), Some(FloatType::F32)); + assert_eq!(FloatType::from_suffix(f.suffix()), Some(FloatType::F32)); let f = FloatLit::parse("8.2e-_04_9f64")?; assert_eq!(f.number_part(), "8.2e-_04_9"); assert_eq!(f.integer_part(), "8"); assert_eq!(f.fractional_part(), Some("2")); assert_eq!(f.exponent_part(), "e-_04_9"); - assert_eq!(f.type_suffix(), Some(FloatType::F64)); + assert_eq!(FloatType::from_suffix(f.suffix()), Some(FloatType::F64)); Ok(()) } @@ -125,7 +125,6 @@ fn simple() { check!("3" ".14" "" f32); check!("3" ".14" "" f64); - check!("3" "" "" f32); check!("3" "" "e987654321" -); check!("3" "" "e987654321" f64); @@ -157,6 +156,47 @@ fn simple() { check!("0" ".9182" "E+0" f32); } +#[test] +fn non_standard_suffixes() { + #[track_caller] + fn 
check_suffix( + input: &str, + integer_part: &str, + fractional_part: Option<&str>, + exponent_part: &str, + suffix: &str, + ) { + let lit = FloatLit::parse(input) + .unwrap_or_else(|e| panic!("expected to parse '{}' but got {}", input, e)); + assert_eq!(lit.integer_part(), integer_part); + assert_eq!(lit.fractional_part(), fractional_part); + assert_eq!(lit.exponent_part(), exponent_part); + assert_eq!(lit.suffix(), suffix); + + let lit = match Literal::parse(input) { + Ok(Literal::Float(f)) => f, + other => panic!("Expected float literal, but got {:?} for '{}'", other, input), + }; + assert_eq!(lit.integer_part(), integer_part); + assert_eq!(lit.fractional_part(), fractional_part); + assert_eq!(lit.exponent_part(), exponent_part); + assert_eq!(lit.suffix(), suffix); + } + + check_suffix("7.1f23", "7", Some("1"), "", "f23"); + check_suffix("7.1f320", "7", Some("1"), "", "f320"); + check_suffix("7.1f64_", "7", Some("1"), "", "f64_"); + check_suffix("8.1f649", "8", Some("1"), "", "f649"); + check_suffix("8.1f64f32", "8", Some("1"), "", "f64f32"); + check_suffix("23e2_banana", "23", None, "e2_", "banana"); + check_suffix("23.2_banana", "23", Some("2_"), "", "banana"); + check_suffix("23e2pe55ter", "23", None, "e2", "pe55ter"); + check_suffix("23e2p_e55ter", "23", None, "e2", "p_e55ter"); + check_suffix("3.15Jürgen", "3", Some("15"), "", "Jürgen"); + check_suffix("3e2e5", "3", None, "e2", "e5"); + check_suffix("3e2e5f", "3", None, "e2", "e5f"); +} + #[test] fn parse_err() { assert_err!(FloatLit, "", Empty, None); @@ -176,10 +216,11 @@ fn parse_err() { assert_err_single!(FloatLit::parse("_2.7"), DoesNotStartWithDigit, 0); assert_err_single!(FloatLit::parse(".5"), DoesNotStartWithDigit, 0); - assert_err_single!(FloatLit::parse("0x44.5"), InvalidFloatTypeSuffix, 1..6); assert_err!(FloatLit, "1e", NoExponentDigits, 1..2); assert_err!(FloatLit, "1.e4", UnexpectedChar, 2); assert_err!(FloatLit, "3._4", UnexpectedChar, 2); + assert_err!(FloatLit, "3.f32", UnexpectedChar, 2); 
+ assert_err!(FloatLit, "3.e5", UnexpectedChar, 2); assert_err!(FloatLit, "12345._987", UnexpectedChar, 6); assert_err!(FloatLit, "46._", UnexpectedChar, 3); assert_err!(FloatLit, "46.f32", UnexpectedChar, 3); @@ -188,19 +229,25 @@ fn parse_err() { assert_err!(FloatLit, "46.e3f64", UnexpectedChar, 3); assert_err!(FloatLit, "23.4e_", NoExponentDigits, 4..6); assert_err!(FloatLit, "23E___f32", NoExponentDigits, 2..6); - assert_err!(FloatLit, "7f23", InvalidFloatTypeSuffix, 1..4); - assert_err!(FloatLit, "7f320", InvalidFloatTypeSuffix, 1..5); - assert_err!(FloatLit, "7f64_", InvalidFloatTypeSuffix, 1..5); - assert_err!(FloatLit, "8f649", InvalidFloatTypeSuffix, 1..5); - assert_err!(FloatLit, "8f64f32", InvalidFloatTypeSuffix, 1..7); - assert_err!(FloatLit, "55e3.1", InvalidFloatTypeSuffix, 4..6); // suboptimal - - assert_err!(FloatLit, "3.7+", InvalidFloatTypeSuffix, 3..4); - assert_err!(FloatLit, "3.7+2", InvalidFloatTypeSuffix, 3..5); - assert_err!(FloatLit, "3.7-", InvalidFloatTypeSuffix, 3..4); - assert_err!(FloatLit, "3.7-2", InvalidFloatTypeSuffix, 3..5); + assert_err!(FloatLit, "55e3.1", UnexpectedChar, 4..6); + + assert_err!(FloatLit, "3.7+", UnexpectedChar, 3..4); + assert_err!(FloatLit, "3.7+2", UnexpectedChar, 3..5); + assert_err!(FloatLit, "3.7-", UnexpectedChar, 3..4); + assert_err!(FloatLit, "3.7-2", UnexpectedChar, 3..5); assert_err!(FloatLit, "3.7e+", NoExponentDigits, 3..5); assert_err!(FloatLit, "3.7e-", NoExponentDigits, 3..5); - assert_err!(FloatLit, "3.7e-+3", NoExponentDigits, 3..5); // suboptimal - assert_err!(FloatLit, "3.7e+-3", NoExponentDigits, 3..5); // suboptimal + assert_err!(FloatLit, "3.7e-+3", NoExponentDigits, 3..5); // suboptimal error + assert_err!(FloatLit, "3.7e+-3", NoExponentDigits, 3..5); // suboptimal error + assert_err_single!(FloatLit::parse("0x44.5"), InvalidSuffix, 1..6); + + assert_err_single!(FloatLit::parse("3"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("35_389"), UnexpectedIntegerLit, None); + 
assert_err_single!(FloatLit::parse("9_8_7f32"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("9_8_7banana"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("7f23"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("7f320"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("7f64_"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("8f649"), UnexpectedIntegerLit, None); + assert_err_single!(FloatLit::parse("8f64f32"), UnexpectedIntegerLit, None); } diff --git a/src/integer/mod.rs b/src/integer/mod.rs index 79f7e55..0b689a5 100644 --- a/src/integer/mod.rs +++ b/src/integer/mod.rs @@ -1,9 +1,9 @@ -use std::fmt; +use std::{fmt, str::FromStr}; use crate::{ Buffer, ParseError, err::{perr, ParseErrorKind::*}, - parse::{first_byte_or_empty, hex_digit_value}, + parse::{first_byte_or_empty, hex_digit_value, check_suffix}, }; @@ -25,52 +25,14 @@ use crate::{ #[derive(Debug, Clone, Copy, PartialEq, Eq)] #[non_exhaustive] pub struct IntegerLit { + /// The raw literal. Grammar: `
`. raw: B, - // First index of the main number part (after the base prefix). + /// First index of the main number part (after the base prefix). start_main_part: usize, - // First index not part of the main number part. + /// First index not part of the main number part. end_main_part: usize, + /// Parsed `raw[..start_main_part]`. base: IntegerBase, - type_suffix: Option, -} - -/// The bases in which an integer can be specified. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum IntegerBase { - Binary, - Octal, - Decimal, - Hexadecimal, -} - -/// All possible integer type suffixes. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum IntegerType { - U8, - U16, - U32, - U64, - U128, - Usize, - I8, - I16, - I32, - I64, - I128, - Isize, -} - -impl IntegerBase { - /// Returns the literal prefix that indicates this base, i.e. `"0b"`, - /// `"0o"`, `""` and `"0x"`. - pub fn prefix(self) -> &'static str { - match self { - Self::Binary => "0b", - Self::Octal => "0o", - Self::Decimal => "", - Self::Hexadecimal => "0x", - } - } } impl IntegerLit { @@ -84,17 +46,10 @@ impl IntegerLit { start_main_part, end_main_part, base, - type_suffix, .. } = parse_impl(&input, digit)?; - Ok(Self { - raw: input, - start_main_part, - end_main_part, - base, - type_suffix, - }) + Ok(Self { raw: input, start_main_part, end_main_part, base }) }, _ => Err(perr(0, DoesNotStartWithDigit)), } @@ -106,12 +61,7 @@ impl IntegerLit { /// /// Returns `None` if the literal overflows `N`. 
pub fn value(&self) -> Option { - let base = match self.base { - IntegerBase::Binary => N::from_small_number(2), - IntegerBase::Octal => N::from_small_number(8), - IntegerBase::Decimal => N::from_small_number(10), - IntegerBase::Hexadecimal => N::from_small_number(16), - }; + let base = N::from_small_number(self.base.value()); let mut acc = N::from_small_number(0); for digit in self.raw_main_part().bytes() { @@ -142,9 +92,11 @@ impl IntegerLit { &(*self.raw)[self.start_main_part..self.end_main_part] } - /// The type suffix, if specified. - pub fn type_suffix(&self) -> Option { - self.type_suffix + /// The optional suffix. Returns `""` if the suffix is empty/does not exist. + /// + /// If you want the type, try `IntegerType::from_suffix(lit.suffix())`. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.end_main_part..] } /// Returns the raw input that was passed to `parse`. @@ -167,7 +119,6 @@ impl IntegerLit<&str> { start_main_part: self.start_main_part, end_main_part: self.end_main_part, base: self.base, - type_suffix: self.type_suffix, } } } @@ -248,59 +199,150 @@ pub(crate) fn parse_impl(input: &str, first: u8) -> Result, Par }; let without_prefix = &input[end_prefix..]; - // Find end of main part. - let end_main = without_prefix.bytes() - .position(|b| !matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_')) - .unwrap_or(without_prefix.len()); - let (main_part, type_suffix) = without_prefix.split_at(end_main); - - // Check for invalid digits and make sure there is at least one valid digit. 
- let invalid_digit_pos = match base { - IntegerBase::Binary => main_part.bytes() - .position(|b| !matches!(b, b'0' | b'1' | b'_')), - IntegerBase::Octal => main_part.bytes() - .position(|b| !matches!(b, b'0'..=b'7' | b'_')), - IntegerBase::Decimal => main_part.bytes() - .position(|b| !matches!(b, b'0'..=b'9' | b'_')), - IntegerBase::Hexadecimal => None, - }; - if let Some(pos) = invalid_digit_pos { - return Err(perr(end_prefix + pos, InvalidDigit)); + // Scan input to find the first character that's not a valid digit. + let is_valid_digit = match base { + IntegerBase::Binary => |b| matches!(b, b'0' | b'1' | b'_'), + IntegerBase::Octal => |b| matches!(b, b'0'..=b'7' | b'_'), + IntegerBase::Decimal => |b| matches!(b, b'0'..=b'9' | b'_'), + IntegerBase::Hexadecimal => |b| matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F' | b'_'), + }; + let end_main = without_prefix.bytes() + .position(|b| !is_valid_digit(b)) + .unwrap_or(without_prefix.len()); + let (main_part, suffix) = without_prefix.split_at(end_main); + + check_suffix(suffix).map_err(|kind| { + // This is just to have a nicer error kind for this special case. If the + // suffix is invalid, it is non-empty -> unwrap ok. + let first = suffix.as_bytes()[0]; + if !is_valid_digit(first) && first.is_ascii_digit() { + perr(end_main + end_prefix, InvalidDigit) + } else { + perr(end_main + end_prefix..input.len(), kind) + } + })?; + if suffix.starts_with('e') || suffix.starts_with('E') { + return Err(perr(end_main, IntegerSuffixStartingWithE)); } + // Make sure main number part is not empty. 
if main_part.bytes().filter(|&b| b != b'_').count() == 0 { return Err(perr(end_prefix..end_prefix + end_main, NoDigits)); } - - // Parse type suffix - let type_suffix = match type_suffix { - "" => None, - "u8" => Some(IntegerType::U8), - "u16" => Some(IntegerType::U16), - "u32" => Some(IntegerType::U32), - "u64" => Some(IntegerType::U64), - "u128" => Some(IntegerType::U128), - "usize" => Some(IntegerType::Usize), - "i8" => Some(IntegerType::I8), - "i16" => Some(IntegerType::I16), - "i32" => Some(IntegerType::I32), - "i64" => Some(IntegerType::I64), - "i128" => Some(IntegerType::I128), - "isize" => Some(IntegerType::Isize), - _ => return Err(perr(end_main + end_prefix..input.len(), InvalidIntegerTypeSuffix)), - }; - Ok(IntegerLit { raw: input, start_main_part: end_prefix, end_main_part: end_main + end_prefix, base, - type_suffix, }) } +/// The bases in which an integer can be specified. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IntegerBase { + Binary, + Octal, + Decimal, + Hexadecimal, +} + +impl IntegerBase { + /// Returns the literal prefix that indicates this base, i.e. `"0b"`, + /// `"0o"`, `""` and `"0x"`. + pub fn prefix(self) -> &'static str { + match self { + Self::Binary => "0b", + Self::Octal => "0o", + Self::Decimal => "", + Self::Hexadecimal => "0x", + } + } + + /// Returns the base value, i.e. 2, 8, 10 or 16. + pub fn value(self) -> u8 { + match self { + Self::Binary => 2, + Self::Octal => 8, + Self::Decimal => 10, + Self::Hexadecimal => 16, + } + } +} + +/// All possible integer type suffixes. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum IntegerType { + U8, + U16, + U32, + U64, + U128, + Usize, + I8, + I16, + I32, + I64, + I128, + Isize, +} + +impl IntegerType { + /// Returns the type corresponding to the given suffix (e.g. `"u8"` is + /// mapped to `Self::U8`). If the suffix is not a valid integer type, + /// `None` is returned. 
+ pub fn from_suffix(suffix: &str) -> Option { + match suffix { + "u8" => Some(Self::U8), + "u16" => Some(Self::U16), + "u32" => Some(Self::U32), + "u64" => Some(Self::U64), + "u128" => Some(Self::U128), + "usize" => Some(Self::Usize), + "i8" => Some(Self::I8), + "i16" => Some(Self::I16), + "i32" => Some(Self::I32), + "i64" => Some(Self::I64), + "i128" => Some(Self::I128), + "isize" => Some(Self::Isize), + _ => None, + } + } + + /// Returns the suffix for this type, e.g. `"u8"` for `Self::U8`. + pub fn suffix(self) -> &'static str { + match self { + Self::U8 => "u8", + Self::U16 => "u16", + Self::U32 => "u32", + Self::U64 => "u64", + Self::U128 => "u128", + Self::Usize => "usize", + Self::I8 => "i8", + Self::I16 => "i16", + Self::I32 => "i32", + Self::I64 => "i64", + Self::I128 => "i128", + Self::Isize => "isize", + } + } +} + +impl FromStr for IntegerType { + type Err = (); + fn from_str(s: &str) -> Result { + Self::from_suffix(s).ok_or(()) + } +} + +impl fmt::Display for IntegerType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.suffix().fmt(f) + } +} + + #[cfg(test)] mod tests; diff --git a/src/integer/tests.rs b/src/integer/tests.rs index 1656345..e6dad3f 100644 --- a/src/integer/tests.rs +++ b/src/integer/tests.rs @@ -20,13 +20,13 @@ fn check( start_main_part: base.prefix().len(), end_main_part: base.prefix().len() + main_part.len(), base, - type_suffix }; assert_parse_ok_eq( input, IntegerLit::parse(input), expected_integer.clone(), "IntegerLit::parse"); assert_parse_ok_eq( input, Literal::parse(input), Literal::Integer(expected_integer), "Literal::parse"); assert_roundtrip(expected_integer.to_owned(), input); + assert_eq!(Ty::from_suffix(IntegerLit::parse(input).unwrap().suffix()), type_suffix); let actual_value = IntegerLit::parse(input) .unwrap() @@ -101,7 +101,7 @@ fn parse_binary() { check("0b10010u8", 0b10010u8, Binary, "10010", Some(Ty::U8)); check("0b10010i8", 0b10010u8, Binary, "10010", Some(Ty::I8)); check("0b10010u64", 
0b10010u64, Binary, "10010", Some(Ty::U64)); - check("0b10010i64", 0b10010u64, Binary, "10010", Some(Ty::I64)); + check("0b10010i64", 0b10010i64, Binary, "10010", Some(Ty::I64)); check( "0b1011001_00110000_00101000_10100101u32", 0b1011001_00110000_00101000_10100101u32, @@ -197,7 +197,7 @@ fn suffixes() { ("123u64", Ty::U64), ("123u128", Ty::U128), ].iter().for_each(|&(s, ty)| { - assert_eq!(IntegerLit::parse(s).unwrap().type_suffix(), Some(ty)); + assert_eq!(Ty::from_suffix(IntegerLit::parse(s).unwrap().suffix()), Some(ty)); }); } @@ -249,17 +249,15 @@ fn parse_err() { assert_err!(IntegerLit, "", Empty, None); assert_err_single!(IntegerLit::parse("a"), DoesNotStartWithDigit, 0); assert_err_single!(IntegerLit::parse(";"), DoesNotStartWithDigit, 0); - assert_err_single!(IntegerLit::parse("0;"), InvalidIntegerTypeSuffix, 1..2); - assert_err_single!(IntegerLit::parse("0a"), InvalidDigit, 1); + assert_err_single!(IntegerLit::parse("0;"), UnexpectedChar, 1..2); assert_err!(IntegerLit, "0b", NoDigits, 2..2); - assert_err_single!(IntegerLit::parse("0z"), InvalidIntegerTypeSuffix, 1..2); assert_err_single!(IntegerLit::parse(" 0"), DoesNotStartWithDigit, 0); - assert_err_single!(IntegerLit::parse("0 "), InvalidIntegerTypeSuffix, 1); - assert_err_single!(IntegerLit::parse("0a3"), InvalidDigit, 1); + assert_err_single!(IntegerLit::parse("0 "), UnexpectedChar, 1); assert_err!(IntegerLit, "0b3", InvalidDigit, 2); - assert_err_single!(IntegerLit::parse("0z3"), InvalidIntegerTypeSuffix, 1..3); assert_err_single!(IntegerLit::parse("_"), DoesNotStartWithDigit, 0); assert_err_single!(IntegerLit::parse("_3"), DoesNotStartWithDigit, 0); + assert_err!(IntegerLit, "0x44.5", UnexpectedChar, 4..6); + assert_err_single!(IntegerLit::parse("123em"), IntegerSuffixStartingWithE, 3); } #[test] @@ -267,30 +265,12 @@ fn invalid_digits() { assert_err!(IntegerLit, "0b10201", InvalidDigit, 4); assert_err!(IntegerLit, "0b9", InvalidDigit, 2); assert_err!(IntegerLit, "0b07", InvalidDigit, 3); - 
assert_err!(IntegerLit, "0b0a", InvalidDigit, 3); - assert_err!(IntegerLit, "0b0A", InvalidDigit, 3); - assert_err!(IntegerLit, "0b01f", InvalidDigit, 4); - assert_err!(IntegerLit, "0b01F", InvalidDigit, 4); assert_err!(IntegerLit, "0o12380", InvalidDigit, 5); assert_err!(IntegerLit, "0o192", InvalidDigit, 3); - assert_err!(IntegerLit, "0o7a_", InvalidDigit, 3); - assert_err!(IntegerLit, "0o7A_", InvalidDigit, 3); - assert_err!(IntegerLit, "0o72f_0", InvalidDigit, 4); - assert_err!(IntegerLit, "0o72F_0", InvalidDigit, 4); - - assert_err_single!(IntegerLit::parse("12a3"), InvalidDigit, 2); - assert_err_single!(IntegerLit::parse("12f3"), InvalidDigit, 2); - assert_err_single!(IntegerLit::parse("12f_"), InvalidDigit, 2); - assert_err_single!(IntegerLit::parse("12F_"), InvalidDigit, 2); + assert_err_single!(IntegerLit::parse("a_123"), DoesNotStartWithDigit, 0); assert_err_single!(IntegerLit::parse("B_123"), DoesNotStartWithDigit, 0); - - assert_err!(IntegerLit, "0x8cg", InvalidIntegerTypeSuffix, 4..5); - assert_err!(IntegerLit, "0x8cG", InvalidIntegerTypeSuffix, 4..5); - assert_err!(IntegerLit, "0x8c1h_", InvalidIntegerTypeSuffix, 5..7); - assert_err!(IntegerLit, "0x8c1H_", InvalidIntegerTypeSuffix, 5..7); - assert_err!(IntegerLit, "0x8czu16", InvalidIntegerTypeSuffix, 4..8); } #[test] @@ -317,27 +297,61 @@ fn no_valid_digits() { } #[test] -fn invalid_suffix() { - assert_err!(IntegerLit, "5u7", InvalidIntegerTypeSuffix, 1..3); - assert_err!(IntegerLit, "5u9", InvalidIntegerTypeSuffix, 1..3); - assert_err!(IntegerLit, "5u0", InvalidIntegerTypeSuffix, 1..3); - assert_err!(IntegerLit, "33u12", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "84u17", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "99u80", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "1234uu16", InvalidIntegerTypeSuffix, 4..8); - - assert_err!(IntegerLit, "5i7", InvalidIntegerTypeSuffix, 1..3); - assert_err!(IntegerLit, "5i9", InvalidIntegerTypeSuffix, 1..3); - 
assert_err!(IntegerLit, "5i0", InvalidIntegerTypeSuffix, 1..3); - assert_err!(IntegerLit, "33i12", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "84i17", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "99i80", InvalidIntegerTypeSuffix, 2..5); - assert_err!(IntegerLit, "1234ii16", InvalidIntegerTypeSuffix, 4..8); - - assert_err!(IntegerLit, "0ui32", InvalidIntegerTypeSuffix, 1..5); - assert_err!(IntegerLit, "1iu32", InvalidIntegerTypeSuffix, 1..5); - assert_err_single!(IntegerLit::parse("54321a64"), InvalidDigit, 5); - assert_err!(IntegerLit, "54321b64", InvalidDigit, 5); - assert_err!(IntegerLit, "54321x64", InvalidIntegerTypeSuffix, 5..8); - assert_err!(IntegerLit, "54321o64", InvalidIntegerTypeSuffix, 5..8); +fn non_standard_suffixes() { + #[track_caller] + fn check_suffix( + input: &str, + value: T, + base: IntegerBase, + main_part: &str, + suffix: &str, + ) { + check(input, value, base, main_part, None); + assert_eq!(IntegerLit::parse(input).unwrap().suffix(), suffix); + } + + check_suffix("5u7", 5, Decimal, "5", "u7"); + check_suffix("5u7", 5, Decimal, "5", "u7"); + check_suffix("5u9", 5, Decimal, "5", "u9"); + check_suffix("5u0", 5, Decimal, "5", "u0"); + check_suffix("33u12", 33, Decimal, "33", "u12"); + check_suffix("84u17", 84, Decimal, "84", "u17"); + check_suffix("99u80", 99, Decimal, "99", "u80"); + check_suffix("1234uu16", 1234, Decimal, "1234", "uu16"); + + check_suffix("5i7", 5, Decimal, "5", "i7"); + check_suffix("5i9", 5, Decimal, "5", "i9"); + check_suffix("5i0", 5, Decimal, "5", "i0"); + check_suffix("33i12", 33, Decimal, "33", "i12"); + check_suffix("84i17", 84, Decimal, "84", "i17"); + check_suffix("99i80", 99, Decimal, "99", "i80"); + check_suffix("1234ii16", 1234, Decimal, "1234", "ii16"); + + check_suffix("0ui32", 0, Decimal, "0", "ui32"); + check_suffix("1iu32", 1, Decimal, "1", "iu32"); + check_suffix("54321a64", 54321, Decimal, "54321", "a64"); + check_suffix("54321b64", 54321, Decimal, "54321", "b64"); + 
check_suffix("54321x64", 54321, Decimal, "54321", "x64"); + check_suffix("54321o64", 54321, Decimal, "54321", "o64"); + + check_suffix("0a", 0, Decimal, "0", "a"); + check_suffix("0a3", 0, Decimal, "0", "a3"); + check_suffix("0z", 0, Decimal, "0", "z"); + check_suffix("0z3", 0, Decimal, "0", "z3"); + check_suffix("0b0a", 0, Binary, "0", "a"); + check_suffix("0b0A", 0, Binary, "0", "A"); + check_suffix("0b01f", 1, Binary, "01", "f"); + check_suffix("0b01F", 1, Binary, "01", "F"); + check_suffix("0o7a_", 7, Octal, "7", "a_"); + check_suffix("0o7A_", 7, Octal, "7", "A_"); + check_suffix("0o72f_0", 0o72, Octal, "72", "f_0"); + check_suffix("0o72F_0", 0o72, Octal, "72", "F_0"); + + check_suffix("0x8cg", 0x8c, Hexadecimal, "8c", "g"); + check_suffix("0x8cG", 0x8c, Hexadecimal, "8c", "G"); + check_suffix("0x8c1h_", 0x8c1, Hexadecimal, "8c1", "h_"); + check_suffix("0x8c1H_", 0x8c1, Hexadecimal, "8c1", "H_"); + check_suffix("0x8czu16", 0x8c, Hexadecimal, "8c", "zu16"); + + check_suffix("123_foo", 123, Decimal, "123_", "foo"); } diff --git a/src/lib.rs b/src/lib.rs index bd81f56..bdde6b7 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -82,7 +82,7 @@ //! // Parse a specific kind of literal (float in this case): //! let float_lit = FloatLit::parse("3.14f32"); //! assert!(float_lit.is_ok()); -//! assert_eq!(float_lit.unwrap().type_suffix(), Some(litrs::FloatType::F32)); +//! assert_eq!(float_lit.unwrap().suffix(), "f32"); //! assert!(FloatLit::parse("'c'").is_err()); //! //! // Parse any kind of literal. After parsing, you can inspect the literal @@ -105,6 +105,11 @@ //! //! - `proc-macro2` (**default**): adds the dependency `proc_macro2`, a bunch of //! `From` and `TryFrom` impls, and [`InvalidToken::to_compile_error2`]. +//! - `check_suffix`: if enabled, `parse` functions will exactly verify that the +//! literal suffix is valid. Adds the dependency `unicode-xid`. If disabled, +//! only an approximate check (only in ASCII range) is done. If you are +//! 
writing a proc macro, you don't need to enable this as the suffix is +//! already checked by the compiler. //! //! //! [ref]: https://doc.rust-lang.org/reference/tokens.html#literals @@ -179,6 +184,62 @@ pub enum Literal { ByteString(ByteStringLit), } +impl Literal { + /// Parses the given input as a Rust literal. + pub fn parse(input: B) -> Result { + parse::parse(input) + } + + /// Returns the suffix of this literal or `""` if it doesn't have one. + /// + /// Rust token grammar actually allows suffixes for all kinds of tokens. + /// Most Rust programmers only know the type suffixes for integers and + /// floats, e.g. `0u32`. And in normal Rust code, everything else causes an + /// error. But it is possible to pass literals with arbitrary suffixes to + /// proc macros, for example: + /// + /// ```ignore + /// some_macro!(3.14f33 16px '🦊'good_boy "toph"beifong); + /// ``` + /// + /// Boolean literals, not actually being literals, but idents, cannot have + /// suffixes and this method always returns `""` for those. + /// + /// There are some edge cases to be aware of: + /// - Integer suffixes must not start with `e` or `E` as that conflicts with + /// the exponent grammar for floats. `0e1` is a float; `0eel` is also + /// parsed as a float and results in an error. + /// - Hexadecimal integers eagerly parse digits, so `0x5abcdefgh` has a + /// suffix of `gh`. + /// - Suffixes can contain and start with `_`, but for integer and number + /// literals, `_` is eagerly parsed as part of the number, so `1_x` has + /// the suffix `x`. + /// - The input `55f32` is regarded as an integer literal with suffix `f32`. 
+ /// + /// # Example + /// + /// ``` + /// use litrs::Literal; + /// + /// assert_eq!(Literal::parse(r##"3.14f33"##).unwrap().suffix(), "f33"); + /// assert_eq!(Literal::parse(r##"123hackerman"##).unwrap().suffix(), "hackerman"); + /// assert_eq!(Literal::parse(r##"0x0fuck"##).unwrap().suffix(), "uck"); + /// assert_eq!(Literal::parse(r##"'🦊'good_boy"##).unwrap().suffix(), "good_boy"); + /// assert_eq!(Literal::parse(r##""toph"beifong"##).unwrap().suffix(), "beifong"); + /// ``` + pub fn suffix(&self) -> &str { + match self { + Literal::Bool(_) => "", + Literal::Integer(l) => l.suffix(), + Literal::Float(l) => l.suffix(), + Literal::Char(l) => l.suffix(), + Literal::String(l) => l.suffix(), + Literal::Byte(l) => l.suffix(), + Literal::ByteString(l) => l.suffix(), + } + } +} + impl Literal<&str> { /// Makes a copy of the underlying buffer and returns the owned version of /// `Self`. diff --git a/src/parse.rs b/src/parse.rs index a0266da..efc6b87 100644 --- a/src/parse.rs +++ b/src/parse.rs @@ -9,52 +9,44 @@ use crate::{ IntegerLit, Literal, StringLit, - err::{perr, ParseErrorKind::*}, + err::{perr, ParseErrorKind::{*, self}}, }; -impl Literal { - /// Parses the given input as a Rust literal. - pub fn parse(input: B) -> Result { - let (first, rest) = input.as_bytes().split_first().ok_or(perr(None, Empty))?; - let second = input.as_bytes().get(1).copied(); - - match first { - b'f' if &*input == "false" => Ok(Self::Bool(BoolLit::False)), - b't' if &*input == "true" => Ok(Self::Bool(BoolLit::True)), - - // A number literal (integer or float). - b'0'..=b'9' => { - // To figure out whether this is a float or integer, we do some - // quick inspection here. Yes, this is technically duplicate - // work with what is happening in the integer/float parse - // methods, but it makes the code way easier for now and won't - // be a huge performance loss. 
- let end = 1 + end_dec_digits(rest); - match input.as_bytes().get(end) { - // Potential chars in integer literals: b, o, x for base; u - // and i for type suffix. - None | Some(b'b') | Some(b'o') | Some(b'x') | Some(b'u') | Some(b'i') - => IntegerLit::parse(input).map(Literal::Integer), - - // Potential chars for float literals: `.` as fractional - // period, e and E as exponent start and f as type suffix. - Some(b'.') | Some(b'e') | Some(b'E') | Some(b'f') - => FloatLit::parse(input).map(Literal::Float), - - _ => Err(perr(end, UnexpectedChar)), - } - }, - - b'\'' => CharLit::parse(input).map(Literal::Char), - b'"' | b'r' => StringLit::parse_impl(input).map(Literal::String), - - b'b' if second == Some(b'\'') => ByteLit::parse(input).map(Literal::Byte), - b'b' if second == Some(b'r') || second == Some(b'"') - => ByteStringLit::parse_impl(input).map(Literal::ByteString), - - _ => Err(perr(None, InvalidLiteral)), - } +pub fn parse(input: B) -> Result, ParseError> { + let (first, rest) = input.as_bytes().split_first().ok_or(perr(None, Empty))?; + let second = input.as_bytes().get(1).copied(); + + match first { + b'f' if &*input == "false" => Ok(Literal::Bool(BoolLit::False)), + b't' if &*input == "true" => Ok(Literal::Bool(BoolLit::True)), + + // A number literal (integer or float). + b'0'..=b'9' => { + // To figure out whether this is a float or integer, we do some + // quick inspection here. Yes, this is technically duplicate + // work with what is happening in the integer/float parse + // methods, but it makes the code way easier for now and won't + // be a huge performance loss. + // + // The first non-decimal char in a float literal must + // be '.', 'e' or 'E'. 
+ match input.as_bytes().get(1 + end_dec_digits(rest)) { + Some(b'.') | Some(b'e') | Some(b'E') + => FloatLit::parse(input).map(Literal::Float), + + _ => IntegerLit::parse(input).map(Literal::Integer), + } + }, + + b'\'' => CharLit::parse(input).map(Literal::Char), + b'"' | b'r' => StringLit::parse(input).map(Literal::String), + + b'b' if second == Some(b'\'') => ByteLit::parse(input).map(Literal::Byte), + b'b' if second == Some(b'r') || second == Some(b'"') + => ByteStringLit::parse(input).map(Literal::ByteString), + + _ => Err(perr(None, InvalidLiteral)), } } @@ -79,3 +71,55 @@ pub(crate) fn hex_digit_value(digit: u8) -> Option { _ => None, } } + +/// Makes sure that `s` is a valid literal suffix. +pub(crate) fn check_suffix(s: &str) -> Result<(), ParseErrorKind> { + if s.is_empty() { + return Ok(()); + } + + let mut chars = s.chars(); + let first = chars.next().unwrap(); + let rest = chars.as_str(); + if first == '_' && rest.is_empty() { + return Err(InvalidSuffix); + } + + // This is just an extra check to improve the error message. If the first + // character of the "suffix" is already some invalid ASCII + // char, "unexpected character" seems like the more fitting error. + if first.is_ascii() && !(first.is_ascii_alphabetic() || first == '_') { + return Err(UnexpectedChar); + } + + // Proper check is optional as it's not really necessary in proc macro + // context. + #[cfg(feature = "check_suffix")] + fn is_valid_suffix(first: char, rest: &str) -> bool { + use unicode_xid::UnicodeXID; + + (first == '_' || first.is_xid_start()) + && rest.chars().all(|c| c.is_xid_continue()) + } + + // When avoiding the dependency on `unicode_xid`, we just do a best effort + // to catch the most common errors. 
+ #[cfg(not(feature = "check_suffix"))] + fn is_valid_suffix(first: char, rest: &str) -> bool { + if first.is_ascii() && !(first.is_ascii_alphabetic() || first == '_') { + return false; + } + for c in rest.chars() { + if c.is_ascii() && !(c.is_ascii_alphanumeric() || c == '_') { + return false; + } + } + true + } + + if is_valid_suffix(first, rest) { + Ok(()) + } else { + Err(InvalidSuffix) + } +} diff --git a/src/string/mod.rs b/src/string/mod.rs index ab1cc3f..d2034a6 100644 --- a/src/string/mod.rs +++ b/src/string/mod.rs @@ -18,13 +18,16 @@ pub struct StringLit { /// The raw input. raw: B, - /// The string value (with all escaped unescaped), or `None` if there were - /// no escapes. In the latter case, `input` is the string value. + /// The string value (with all escapes unescaped), or `None` if there were + /// no escapes. In the latter case, the string value is in `raw`. value: Option, /// The number of hash signs in case of a raw string literal, or `None` if /// it's not a raw string literal. num_hashes: Option, + + /// Start index of the suffix or `raw.len()` if there is no suffix. + start_suffix: usize, } impl StringLit { @@ -32,7 +35,10 @@ impl StringLit { /// input is invalid or represents a different kind of literal. pub fn parse(input: B) -> Result { match first_byte_or_empty(&input)? { - b'r' | b'"' => Self::parse_impl(input), + b'r' | b'"' => { + let (value, num_hashes, start_suffix) = parse_impl(&input)?; + Ok(Self { raw: input, value, num_hashes, start_suffix }) + } _ => Err(perr(0, InvalidStringLiteralStart)), } } @@ -53,6 +59,11 @@ impl StringLit { value.map(B::Cow::from).unwrap_or_else(|| raw.cut(inner_range).into_cow()) } + /// The optional suffix. Returns `""` if the suffix is empty/does not exist. + pub fn suffix(&self) -> &str { + &(*self.raw)[self.start_suffix..] + } + /// Returns whether this literal is a raw string literal (starting with /// `r`). 
pub fn is_raw_string(&self) -> bool { @@ -72,27 +83,8 @@ impl StringLit { /// The range within `self.raw` that excludes the quotes and potential `r#`. fn inner_range(&self) -> Range { match self.num_hashes { - None => 1..self.raw.len() - 1, - Some(n) => 1 + n as usize + 1..self.raw.len() - n as usize - 1, - } - } - - /// Precondition: input has to start with either `"` or `r`. - pub(crate) fn parse_impl(input: B) -> Result { - if input.starts_with('r') { - let (value, num_hashes) = scan_raw_string::(&input, 1)?; - Ok(Self { - raw: input, - value, - num_hashes: Some(num_hashes), - }) - } else { - let value = unescape_string::(&input, 1)?; - Ok(Self { - raw: input, - value, - num_hashes: None, - }) + None => 1..self.start_suffix - 1, + Some(n) => 1 + n as usize + 1..self.start_suffix - n as usize - 1, } } } @@ -105,6 +97,7 @@ impl StringLit<&str> { raw: self.raw.to_owned(), value: self.value, num_hashes: self.num_hashes, + start_suffix: self.start_suffix, } } } @@ -115,6 +108,18 @@ impl fmt::Display for StringLit { } } +/// Precondition: input has to start with either `"` or `r`. +#[inline(never)] +pub(crate) fn parse_impl(input: &str) -> Result<(Option, Option, usize), ParseError> { + if input.starts_with('r') { + scan_raw_string::(&input, 1) + .map(|(v, hashes, start_suffix)| (v, Some(hashes), start_suffix)) + } else { + unescape_string::(&input, 1) + .map(|(v, start_suffix)| (v, None, start_suffix)) + } +} + #[cfg(test)] mod tests; diff --git a/src/string/tests.rs b/src/string/tests.rs index 51519ab..1c0cb63 100644 --- a/src/string/tests.rs +++ b/src/string/tests.rs @@ -4,18 +4,24 @@ use crate::{Literal, StringLit, test_util::{assert_parse_ok_eq, assert_roundtrip macro_rules! 
check { ($lit:literal, $has_escapes:expr, $num_hashes:expr) => { - let input = stringify!($lit); + check!($lit, stringify!($lit), $has_escapes, $num_hashes, "") + }; + ($lit:literal, $input:expr, $has_escapes:expr, $num_hashes:expr, $suffix:literal) => { + let input = $input; let expected = StringLit { raw: input, value: if $has_escapes { Some($lit.to_string()) } else { None }, num_hashes: $num_hashes, + start_suffix: input.len() - $suffix.len(), }; assert_parse_ok_eq(input, StringLit::parse(input), expected.clone(), "StringLit::parse"); assert_parse_ok_eq( input, Literal::parse(input), Literal::String(expected.clone()), "Literal::parse"); - assert_eq!(StringLit::parse(input).unwrap().value(), $lit); - assert_eq!(StringLit::parse(input).unwrap().into_value(), $lit); + let lit = StringLit::parse(input).unwrap(); + assert_eq!(lit.value(), $lit); + assert_eq!(lit.suffix(), $suffix); + assert_eq!(lit.into_value(), $lit); assert_roundtrip(expected.into_owned(), input); }; } @@ -47,6 +53,7 @@ fn special_whitespace() { raw: &*input, value: None, num_hashes, + start_suffix: input.len(), }; assert_parse_ok_eq( &input, StringLit::parse(&*input), expected.clone(), "StringLit::parse"); @@ -185,6 +192,15 @@ fn raw_string() { check!(r#"さび\n\t\r\0\\x60\u{123}フェリス"#, false, Some(1)); } +#[test] +fn suffixes() { + check!("hello", r###""hello"suffix"###, false, None, "suffix"); + check!(r"お前はもう死んでいる", r###"r"お前はもう死んでいる"_banana"###, false, Some(0), "_banana"); + check!("fox", r#""fox"peter"#, false, None, "peter"); + check!("🦊", r#""🦊"peter"#, false, None, "peter"); + check!("నక్క\\\\u{0b10}", r###""నక్క\\\\u{0b10}"jü_rgen"###, true, None, "jü_rgen"); +} + #[test] fn parse_err() { assert_err!(StringLit, r#"""#, UnterminatedString, None); @@ -192,10 +208,8 @@ fn parse_err() { assert_err!(StringLit, r#""Jürgen"#, UnterminatedString, None); assert_err!(StringLit, r#""foo bar baz"#, UnterminatedString, None); - assert_err!(StringLit, r#""fox"peter"#, UnexpectedChar, 5..10); - 
assert_err!(StringLit, r#""fox"peter""#, UnexpectedChar, 5..11); - assert_err!(StringLit, r#""fox"🦊"#, UnexpectedChar, 5..9); - assert_err!(StringLit, r###"r#"foo "# bar"#"###, UnexpectedChar, 9..15); + assert_err!(StringLit, r#""fox"peter""#, InvalidSuffix, 5); + assert_err!(StringLit, r###"r#"foo "# bar"#"###, UnexpectedChar, 9); assert_err!(StringLit, "\"\r\"", IsolatedCr, 1); assert_err!(StringLit, "\"fo\rx\"", IsolatedCr, 3); @@ -225,10 +239,10 @@ fn invald_ascii_escapes() { } #[test] -fn invald_escapes() { +fn invalid_escapes() { assert_err!(StringLit, r#""\a""#, UnknownEscape, 1..3); assert_err!(StringLit, r#""foo\y""#, UnknownEscape, 4..6); - assert_err!(StringLit, r#""\"#, UnterminatedString, None); + assert_err!(StringLit, r#""\"#, UnterminatedEscape, 1); assert_err!(StringLit, r#""\x""#, UnterminatedEscape, 1..3); assert_err!(StringLit, r#""🦊\x1""#, UnterminatedEscape, 5..8); assert_err!(StringLit, r#"" \xaj""#, InvalidXEscape, 2..6); diff --git a/src/tests.rs b/src/tests.rs index 039aa56..613b429 100644 --- a/src/tests.rs +++ b/src/tests.rs @@ -25,25 +25,16 @@ fn invalid_literals() { #[test] fn misc() { - assert_err_single!(Literal::parse("0x44.5"), InvalidIntegerTypeSuffix, 4..6); + assert_err_single!(Literal::parse("0x44.5"), UnexpectedChar, 4..6); assert_err_single!(Literal::parse("a"), InvalidLiteral, None); assert_err_single!(Literal::parse(";"), InvalidLiteral, None); assert_err_single!(Literal::parse("0;"), UnexpectedChar, 1); - assert_err_single!(Literal::parse("0a"), UnexpectedChar, 1); - assert_err_single!(Literal::parse("0z"), UnexpectedChar, 1); assert_err_single!(Literal::parse(" 0"), InvalidLiteral, None); assert_err_single!(Literal::parse("0 "), UnexpectedChar, 1); - assert_err_single!(Literal::parse("0a3"), UnexpectedChar, 1); - assert_err_single!(Literal::parse("0z3"), UnexpectedChar, 1); assert_err_single!(Literal::parse("_"), InvalidLiteral, None); assert_err_single!(Literal::parse("_3"), InvalidLiteral, None); - 
assert_err_single!(Literal::parse("12a3"), UnexpectedChar, 2); - assert_err_single!(Literal::parse("12f3"), InvalidFloatTypeSuffix, 2..4); - assert_err_single!(Literal::parse("12f_"), InvalidFloatTypeSuffix, 2..4); - assert_err_single!(Literal::parse("12F_"), UnexpectedChar, 2); assert_err_single!(Literal::parse("a_123"), InvalidLiteral, None); assert_err_single!(Literal::parse("B_123"), InvalidLiteral, None); - assert_err_single!(Literal::parse("54321a64"), UnexpectedChar, 5); } macro_rules! assert_no_panic {