diff --git a/src/legacy.rs b/src/legacy.rs index d55f3a1..199b2ed 100644 --- a/src/legacy.rs +++ b/src/legacy.rs @@ -63,11 +63,6 @@ pub fn demangle(s: &str) -> Result<(Demangle, &str), ()> { return Err(()); }; - // only work with ascii text - if inner.bytes().any(|c| c & 0x80 != 0) { - return Err(()); - } - let mut elements = 0; let mut chars = inner.chars(); let mut c = chars.next().ok_or(())?; @@ -87,7 +82,8 @@ pub fn demangle(s: &str) -> Result<(Demangle, &str), ()> { // `c` already contains the first character of this identifier, skip it and // all the other characters of this identifier, to reach the next element. - for _ in 0..len { + while len > 0 { + len = len.checked_sub(c.len_utf8()).ok_or(())?; c = chars.next().ok_or(())?; } diff --git a/src/lib.rs b/src/lib.rs index cafec2f..15d2921 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -74,26 +74,12 @@ enum DemangleStyle<'a> { V0(v0::Demangle<'a>), } -/// De-mangles a Rust symbol into a more readable version -/// -/// This function will take a **mangled** symbol and return a value. When printed, -/// the de-mangled version will be written. If the symbol does not look like -/// a mangled symbol, the original value will be written instead. -/// -/// # Examples -/// -/// ``` -/// use rustc_demangle::demangle; -/// -/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); -/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); -/// assert_eq!(demangle("foo").to_string(), "foo"); -/// ``` -pub fn demangle(mut s: &str) -> Demangle { +fn demangle_common(s: &str) -> Option<(DemangleStyle, &str)> { // During ThinLTO LLVM may import and rename internal symbols, so strip out // those endings first as they're one of the last manglings applied to symbol // names. let llvm = ".llvm."; + let mut thinlto_stripped = s; if let Some(i) = s.find(llvm) { let candidate = &s[i + llvm.len()..]; let all_hex = candidate.chars().all(|c| match c { @@ -102,21 +88,14 @@ pub fn demangle(mut s: &str) -> Demangle { }); if all_hex { - s = &s[..i]; + thinlto_stripped = &s[..i]; } } - let mut suffix = ""; - let mut style = match legacy::demangle(s) { - Ok((d, s)) => { - suffix = s; - Some(DemangleStyle::Legacy(d)) - } - Err(()) => match v0::demangle(s) { - Ok((d, s)) => { - suffix = s; - Some(DemangleStyle::V0(d)) - } + match legacy::demangle(thinlto_stripped) { + Ok((d, suffix)) => Some((DemangleStyle::Legacy(d), suffix)), + Err(()) => match v0::demangle(thinlto_stripped) { + Ok((d, suffix)) => Some((DemangleStyle::V0(d), suffix)), // FIXME(eddyb) would it make sense to treat an unknown-validity // symbol (e.g. one that errored with `RecursedTooDeep`) as // v0-mangled, and have the error show up in the demangling? @@ -124,65 +103,105 @@ pub fn demangle(mut s: &str) -> Demangle { // will show up in the demangling, if hidden behind a backref) Err(v0::ParseError::Invalid) | Err(v0::ParseError::RecursedTooDeep) => None, }, - }; + } +} - // Output like LLVM IR adds extra period-delimited words. See if - // we are in that case and save the trailing words if so. - if !suffix.is_empty() { - if suffix.starts_with('.') && is_symbol_like(suffix) { - // Keep the suffix. - } else { - // Reset the suffix and invalidate the demangling. - suffix = ""; - style = None; +/// De-mangles a Rust symbol into a more readable version +/// +/// This function will take a **mangled** symbol and return a value. When printed, +/// the de-mangled version will be written. If the symbol does not look like +/// a mangled symbol, the original value will be written instead. +/// +/// # Examples +/// +/// ``` +/// use rustc_demangle::demangle; +/// +/// assert_eq!(demangle("_ZN4testE").to_string(), "test"); +/// assert_eq!(demangle("_ZN3foo3barE").to_string(), "foo::bar"); +/// assert_eq!(demangle("foo").to_string(), "foo"); +/// ``` +pub fn demangle(s: &str) -> Demangle { + if let Some((style, remainder)) = demangle_common(s) { + // Output like LLVM IR adds extra period-delimited words. See if + // we are in that case and save the trailing words if so. + if remainder.is_empty() || (remainder.starts_with('.') && is_llvm_suffix_like(remainder)) { + return Demangle { + style: Some(style), + original: s, + suffix: remainder, + }; } } - Demangle { - style, + return Demangle { + style: None, original: s, - suffix, + suffix: "", + }; +} + +#[cfg(feature = "std")] +fn demangle_partial(s: &str) -> (Demangle, &str) { + if let Some((style, remainder)) = demangle_common(s) { + // Note: suffix is ALWAYS empty because we do not compute the + // LLVM compatibility (nor do we care) + return ( + Demangle { + style: Some(style), + original: s, + suffix: "", + }, + remainder, + ); } + + ( + Demangle { + style: None, + original: s, + suffix: "", + }, + s, + ) } #[cfg(feature = "std")] fn demangle_line( - line: &str, + mut line: &str, output: &mut impl std::io::Write, include_hash: bool, ) -> std::io::Result<()> { - let mut head = 0; - while head < line.len() { + loop { // Move to the next potential match - let next_head = match (line[head..].find("_ZN"), line[head..].find("_R")) { - (Some(idx), None) | (None, Some(idx)) => head + idx, - (Some(idx1), Some(idx2)) => head + idx1.min(idx2), + let next_head = match (line.find("_ZN"), line.find("_R")) { + (Some(idx), None) | (None, Some(idx)) => idx, + (Some(idx1), Some(idx2)) => idx1.min(idx2), (None, None) => { // No more matches... line.len() } }; - output.write_all(line[head..next_head].as_bytes())?; - head = next_head; - // Find the non-matching character. - // - // If we do not find a character, then until the end of the line is the - // thing to demangle. - let match_end = line[head..] - .find(|ch: char| !(ch == '$' || ch == '.' || ch == '_' || ch.is_ascii_alphanumeric())) - .map(|idx| head + idx) - .unwrap_or(line.len()); - - let mangled = &line[head..match_end]; - head = head + mangled.len(); - if let Ok(demangled) = try_demangle(mangled) { + output.write_all(line[..next_head].as_bytes())?; + line = &line[next_head..]; + + if line.is_empty() { + break; + } + + let (demangled, remainder) = demangle_partial(line); + line = remainder; + + if demangled.style.is_some() { if include_hash { write!(output, "{}", demangled)?; } else { write!(output, "{:#}", demangled)?; } } else { - output.write_all(mangled.as_bytes())?; + // there are maybe valid symbols inside this fake one + output.write_all(&line.as_bytes()[..1])?; + line = &line[1..]; } } Ok(()) @@ -250,7 +269,7 @@ impl<'a> Demangle<'a> { } } -fn is_symbol_like(s: &str) -> bool { +fn is_llvm_suffix_like(s: &str) -> bool { s.chars().all(|c| { // Once `char::is_ascii_punctuation` and `char::is_ascii_alphanumeric` // have been stable for long enough, use those instead for clarity @@ -408,6 +427,14 @@ mod tests { t!("_ZN4test1a2bcE", "test::a::bc"); } + #[test] + fn demangle_emoji() { + t_err!("🐇"); + t!("_ZN4🐇E", "🐇"); + t_err!("_ZN4🐇"); + t!("_ZN4🐇1a2bcE", "🐇::a::bc"); + } + #[test] fn demangle_dollars() { t!("_ZN4$RP$E", ")"); @@ -564,7 +591,7 @@ mod tests { #[cfg(feature = "std")] fn demangle_str(input: &str) -> String { let mut output = Vec::new(); - super::demangle_line(input, &mut output, false); + super::demangle_line(input, &mut output, false).unwrap(); String::from_utf8(output).unwrap() } @@ -577,6 +604,15 @@ mod tests { ); } + #[test] + #[cfg(feature = "std")] + fn find_multiple_emoji() { + assert_eq!( + demangle_str("_ZN4🐇E.llvm moocow _ZN4🐇E.llvm"), + "🐇.llvm moocow 🐇.llvm" + ); + } + #[test] #[cfg(feature = "std")] fn interleaved_new_legacy() {