diff --git a/Cargo.lock b/Cargo.lock index ffc1f0dec1d9a..9d0182d2e2702 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3688,6 +3688,7 @@ dependencies = [ name = "rustc_macros" version = "0.1.0" dependencies = [ + "indexmap", "proc-macro2", "quote", "syn", diff --git a/compiler/rustc_macros/Cargo.toml b/compiler/rustc_macros/Cargo.toml index 73eb0dd56d772..a264a887c593d 100644 --- a/compiler/rustc_macros/Cargo.toml +++ b/compiler/rustc_macros/Cargo.toml @@ -8,6 +8,7 @@ edition = "2018" proc-macro = true [dependencies] +indexmap = "1" synstructure = "0.12.1" syn = { version = "1", features = ["full"] } proc-macro2 = "1" diff --git a/compiler/rustc_macros/src/symbols.rs b/compiler/rustc_macros/src/symbols.rs index 352665f0ab199..052960493afd0 100644 --- a/compiler/rustc_macros/src/symbols.rs +++ b/compiler/rustc_macros/src/symbols.rs @@ -1,5 +1,8 @@ +use indexmap::IndexMap; use proc_macro::TokenStream; +use proc_macro2::TokenTree; use quote::quote; +use std::collections::hash_map::RandomState; use std::collections::HashSet; use syn::parse::{Parse, ParseStream, Result}; use syn::{braced, parse_macro_input, Ident, LitStr, Token}; @@ -44,22 +47,43 @@ impl Parse for Symbol { } } -/// A type used to greedily parse another type until the input is empty. -struct List(Vec); +// Map from an optional keyword class to the list of keywords in it. +// FIXME: the indexmap crate thinks `has_std` is false when building `rustc_macros`, +// so we have to provide the hasher manually. 
+struct Keywords(IndexMap, Vec, RandomState>); -impl Parse for List { +impl Parse for Keywords { + fn parse(input: ParseStream<'_>) -> Result { + let mut classes = IndexMap::<_, Vec<_>, _>::with_hasher(Default::default()); + let mut current_class = None; + while !input.is_empty() { + if input.peek(Token![fn]) { + input.parse::()?; + current_class = Some(input.parse::()?); + input.parse::()?; + } else { + classes.entry(current_class.clone()).or_default().push(input.parse()?); + } + } + Ok(Keywords(classes)) + } +} + +struct Symbols(Vec); + +impl Parse for Symbols { fn parse(input: ParseStream<'_>) -> Result { let mut list = Vec::new(); while !input.is_empty() { list.push(input.parse()?); } - Ok(List(list)) + Ok(Symbols(list)) } } struct Input { - keywords: List, - symbols: List, + keywords: Keywords, + symbols: Symbols, } impl Parse for Input { @@ -78,14 +102,25 @@ impl Parse for Input { } } +/// WARNING: this function must behave equivalently to +/// `Symbol::try_new_inlined()`. It does, modulo the fact that it accepts fewer +/// inputs, panicking on any string containing non-ASCII or NUL bytes. This is +/// fine because static symbols never contain such bytes. Once those bytes are +/// excluded, it reduces to a mere length check. +fn is_inlinable(s: &str) -> bool { + assert!(s.as_bytes().iter().all(|&b| 0 < b && b < 0x80)); + s.len() <= 4 +} + pub fn symbols(input: TokenStream) -> TokenStream { let input = parse_macro_input!(input as Input); let mut keyword_stream = quote! {}; let mut symbols_stream = quote! {}; let mut digits_stream = quote! {}; - let mut prefill_stream = quote! {}; - let mut counter = 0u32; + let mut prefill_tabled_stream = quote! {}; + let mut tabled_counter = 0u32; + let mut keyword_class_stream = quote! {}; let mut keys = HashSet::::new(); let mut prev_key: Option = None; let mut errors = Vec::::new(); @@ -106,18 +141,42 @@ pub fn symbols(input: TokenStream) -> TokenStream { }; // Generate the listed keywords. 
- for keyword in &input.keywords.0 { - let name = &keyword.name; - let value = &keyword.value; - check_dup(&value.value(), &mut errors); - prefill_stream.extend(quote! { - #value, - }); - keyword_stream.extend(quote! { - #[allow(non_upper_case_globals)] - pub const #name: Symbol = Symbol::new(#counter); - }); - counter += 1; + for (class, keywords) in &input.keywords.0 { + let mut class_stream = quote! {}; + for keyword in keywords { + let name = &keyword.name; + let value = &keyword.value; + let v = value.value(); + check_dup(&v, &mut errors); + if is_inlinable(&v) { + keyword_stream.extend(quote! { + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_inlined(#value); + }); + } else { + prefill_tabled_stream.extend(quote! { + #value, + }); + keyword_stream.extend(quote! { + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_tabled(#tabled_counter); + }); + tabled_counter += 1; + } + class_stream.extend(quote! { + | kw::#name + }); + } + if let Some(class) = class { + keyword_class_stream.extend(quote! { + fn #class(self) -> bool { + match self { + #class_stream => true, + _ => false + } + } + }); + } } // Generate the listed symbols. @@ -129,28 +188,32 @@ pub fn symbols(input: TokenStream) -> TokenStream { }; check_dup(&value, &mut errors); check_order(&name.to_string(), &mut errors); - prefill_stream.extend(quote! { - #value, - }); - symbols_stream.extend(quote! { - #[allow(rustc::default_hash_types)] - #[allow(non_upper_case_globals)] - pub const #name: Symbol = Symbol::new(#counter); - }); - counter += 1; + if is_inlinable(&value) { + symbols_stream.extend(quote! { + #[allow(rustc::default_hash_types)] + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_inlined(#value); + }); + } else { + prefill_tabled_stream.extend(quote! { + #value, + }); + symbols_stream.extend(quote! 
{ + #[allow(rustc::default_hash_types)] + #[allow(non_upper_case_globals)] + pub const #name: Symbol = Symbol::new_tabled(#tabled_counter); + }); + tabled_counter += 1; + } } // Generate symbols for the strings "0", "1", ..., "9". for n in 0..10 { let n = n.to_string(); check_dup(&n, &mut errors); - prefill_stream.extend(quote! { - #n, - }); digits_stream.extend(quote! { - Symbol::new(#counter), + Symbol::new_inlined(#n), }); - counter += 1; } if !errors.is_empty() { @@ -180,11 +243,15 @@ pub fn symbols(input: TokenStream) -> TokenStream { impl Interner { pub fn fresh() -> Self { - Interner::prefill(&[ - #prefill_stream + Interner::prefill_tabled(&[ + #prefill_tabled_stream ]) } } + + impl Symbol { + #keyword_class_stream + } }); // To see the generated code generated, uncomment this line, recompile, and diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs index 5092b945f72c4..47f8de8c4622b 100644 --- a/compiler/rustc_span/src/symbol.rs +++ b/compiler/rustc_span/src/symbol.rs @@ -11,6 +11,7 @@ use rustc_serialize::{Decodable, Decoder, Encodable, Encoder}; use std::cmp::{Ord, PartialEq, PartialOrd}; use std::fmt; use std::hash::{Hash, Hasher}; +use std::ops::Deref; use std::str; use crate::{Span, DUMMY_SP, SESSION_GLOBALS}; @@ -25,12 +26,14 @@ symbols! { Keywords { // Special reserved identifiers used internally for elided lifetimes, // unnamed method parameters, crate root module, error recovery etc. + fn is_special: Invalid: "", PathRoot: "{{root}}", DollarCrate: "$crate", Underscore: "_", - // Keywords that are used in stable Rust. + // Keywords that are used in stable Rust on all editions. + fn is_used_keyword_20xx: As: "as", Break: "break", Const: "const", @@ -67,7 +70,8 @@ symbols! { Where: "where", While: "while", - // Keywords that are used in unstable Rust or reserved for future use. + // Keywords that are used in unstable Rust or reserved for future use on all editions. 
+ fn is_unused_keyword_20xx: Abstract: "abstract", Become: "become", Box: "box", @@ -82,18 +86,22 @@ symbols! { Yield: "yield", // Edition-specific keywords that are used in stable Rust. + fn is_used_keyword_2018: Async: "async", // >= 2018 Edition only Await: "await", // >= 2018 Edition only Dyn: "dyn", // >= 2018 Edition only // Edition-specific keywords that are used in unstable Rust or reserved for future use. + fn is_unused_keyword_2018: Try: "try", // >= 2018 Edition only // Special lifetime names + fn is_special_lifetime: UnderscoreLifetime: "'_", StaticLifetime: "'static", // Weak keywords, have special meaning only in specific contexts. + fn is_weak_keyword: Auto: "auto", Catch: "catch", Default: "default", @@ -1370,13 +1378,69 @@ impl fmt::Display for MacroRulesNormalizedIdent { /// An interned string. /// -/// Internally, a `Symbol` is implemented as an index, and all operations -/// (including hashing, equality, and ordering) operate on that index. The use -/// of `rustc_index::newtype_index!` means that `Option` only takes up 4 bytes, -/// because `rustc_index::newtype_index!` reserves the last 256 values for tagging purposes. +/// Internally, a `Symbol` is implemented as a u32, and all operations +/// (including hashing, equality, and ordering) operate on that u32. The use of +/// `rustc_index::newtype_index!` means that `Option` only takes up 4 +/// bytes, because `rustc_index::newtype_index!` reserves the last 256 values +/// for tagging purposes. /// -/// Note that `Symbol` cannot directly be a `rustc_index::newtype_index!` because it -/// implements `fmt::Debug`, `Encodable`, and `Decodable` in special ways. +/// Note that `Symbol` cannot directly be a `rustc_index::newtype_index!` +/// because it implements `fmt::Debug`, `Encodable`, and `Decodable` in special +/// ways. +/// +/// For the interner in `SESSION_GLOBALS`, we use a two-part encoding. 
Strings +/// of length 4 or less (with some exceptions due to two encoding constraints +/// described below) are "inlined", i.e. stored within the u32 itself. Other +/// strings are "tabled", i.e. stored in the interner's table and the u32 is an +/// index. This helps performance because strings of length 4 or less are common +/// (roughly 50% of cases in practice) and those avoid the need to lock +/// and search the hash table. +/// +/// The details of the encoding are as follows. +/// +/// - The highest bit of the u32 is a tag bit. +/// +/// - If the tag bit is 0, the symbol is inlined. The string bytes +/// (`s`) are encoded into the u32 (`u`) in a little-endian fashion: +/// - Byte 0 of `u` holds `s[0]`, or zero if `s.len()` is < 1. +/// - Byte 1 of `u` holds `s[1]`, or zero if `s.len()` is < 2. +/// - Byte 2 of `u` holds `s[2]`, or zero if `s.len()` is < 3. +/// - Byte 3 of `u` holds `s[3]`, or zero if `s.len()` is < 4. +/// +/// Some examples: +/// - "" -> 0x00_00_00_00 +/// - "a" -> 0x00_00_00_61 +/// - "ab" -> 0x00_00_62_61 +/// - "abc" -> 0x00_63_62_61 +/// - "abcd" -> 0x64_63_62_61 +/// +/// Because byte 3 contains the tag bit, for a string of length four to be +/// inlined its final byte must be < 0x80. For example, "cdé" cannot be +/// inlined because in UTF-8 it is a four byte sequence `[0x63, 0x64, 0xc3, +/// 0xa9]` and byte 3 is > 0x7f. (This is the first of the abovementioned +/// encoding constraints.) +/// +/// The length of the string is not explicitly encoded. Rather, it is one +/// more than the index of the most-significant non-zero byte, or zero if all +/// four bytes are zero. (For example, in 0x00_63_62_61 the most-significant +/// non-zero byte is byte 2, and so the length is 3.) Because of this, a string whose final +/// char is a NUL char cannot be represented inline. For example, none of +/// "\0", "a\0", "ab\0", and "abc\0" can be inlined. (This is the second of +/// the abovementioned encoding constraints.) 
+/// +/// - If the tag bit is 1, the symbol is tabled. The high bit must be removed +/// before being used as an index. For example, the tabled symbol with +/// index 5 has the value 0x80_00_00_05. +/// +/// The maximum value of a `newtype_index` is 0xff_ff_ff_00, due to the highest +/// 256 values being reserved. Therefore, the maximum index representable is +/// 0x7f_ff_ff_00 (2,147,483,392). Given that the maximum number of bytes in +/// a crate is 0xff_ff_ff_ff, this should be more than enough for tabled +/// symbols. +/// +/// For interners other than the one in `SESSION_GLOBALS` (which are rare and +/// not that performance-critical) all symbols are tabled, i.e. the u32 is an +/// index into the interner's table with its high bit set. #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)] pub struct Symbol(SymbolIndex); @@ -1384,30 +1448,51 @@ rustc_index::newtype_index! { pub struct SymbolIndex { .. } } +/// These `Symbol` methods work with just the `Symbol` itself, and do not +/// involve `SESSION_GLOBALS`. impl Symbol { - const fn new(n: u32) -> Self { - Symbol(SymbolIndex::from_u32(n)) + /// Try to create an inlined symbol from a string. + /// WARNING: this function must behave equivalently to `is_inlinable()` in + /// `compiler/rustc_macros/src/symbols.rs`. + const fn try_new_inlined(s: &str) -> Option { + let len = s.len(); + let s = s.as_bytes(); + let n = if len == 4 && s[3] != 0 && s[3] < 0x80 { + s[0] as u32 | ((s[1] as u32) << 8) | ((s[2] as u32) << 16) | ((s[3] as u32) << 24) + } else if len == 3 && s[2] != 0 { + s[0] as u32 | ((s[1] as u32) << 8) | ((s[2] as u32) << 16) + } else if len == 2 && s[1] != 0 { + s[0] as u32 | ((s[1] as u32) << 8) + } else if len == 1 && s[0] != 0 { + s[0] as u32 + } else if len == 0 { + 0u32 + } else { + return None; + }; + Some(Symbol(SymbolIndex::from_u32(n))) } - /// Maps a string to its interned representation. 
- pub fn intern(string: &str) -> Self { - with_interner(|interner| interner.intern(string)) + /// Create an inlined symbol from a string. Panic if that's not possible. + const fn new_inlined(s: &str) -> Self { + match Symbol::try_new_inlined(s) { + Some(s) => s, + None => panic!("non-inlinable string"), + } } - /// Access the symbol's chars. This is a slowish operation because it - /// requires locking the symbol interner. - pub fn with R, R>(self, f: F) -> R { - with_interner(|interner| f(interner.get(self))) + /// Create a tabled symbol from an interner index. + const fn new_tabled(index: u32) -> Self { + Symbol(SymbolIndex::from_u32(0x80000000 | index)) } - /// Convert to a `SymbolStr`. This is a slowish operation because it - /// requires locking the symbol interner. - pub fn as_str(self) -> SymbolStr { - with_interner(|interner| unsafe { - SymbolStr { string: std::mem::transmute::<&str, &str>(interner.get(self)) } - }) + /// Convert a tabled symbol to an index for the interner. + fn as_tabled_index(self) -> usize { + debug_assert_ne!(self.0.as_u32() & 0x80000000, 0); + (self.0.as_u32() & !0x80000000) as usize } + /// Extract the inner u32 value. pub fn as_u32(self) -> u32 { self.0.as_u32() } @@ -1421,6 +1506,51 @@ impl Symbol { } } +// These `Symbol` methods are for accessing symbols via `SESSION_GLOBALS`. +impl Symbol { + /// Map a string to its symbol representation, using the interner's table + /// from `SESSION_GLOBALS` if necessary (a relatively slow operation). + pub fn intern(string: &str) -> Self { + if let Some(sym) = Symbol::try_new_inlined(string) { + sym + } else { + with_interner(|interner| interner.intern(string)) + } + } + + /// Access the symbol's chars, using the interner's table from + /// `SESSION_GLOBALS` if necessary (a relatively slow operation). 
+ pub fn with R, R>(self, f: F) -> R { + f(self.as_str().deref()) + } + + /// Convert to a `SymbolStr`, using the interner's table from + /// `SESSION_GLOBALS` if necessary (a relatively slow operation). + pub fn as_str(self) -> SymbolStr { + if self.0.as_u32() & 0x80000000 == 0 { + // The high bit is clear, it's an inlined symbol. + let bytes = self.0.as_u32().to_le_bytes(); + let len = if bytes[3] != 0 { + 4 + } else if bytes[2] != 0 { + 3 + } else if bytes[1] != 0 { + 2 + } else if bytes[0] != 0 { + 1 + } else { + 0 + }; + SymbolStr::Inlined { bytes, len } + } else { + // The high bit is set, it's a tabled symbol. + with_interner(|interner| unsafe { + SymbolStr::Tabled { string: std::mem::transmute::<&str, &str>(interner.get(self)) } + }) + } + } +} + impl fmt::Debug for Symbol { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.with(|str| fmt::Debug::fmt(&str, f)) @@ -1462,6 +1592,9 @@ impl ToStableHashKey for Symbol { } } +/// A string interner. The interner in `SESSION_GLOBALS` can (and should) be +/// accessed via `Symbol` methods. Non-global interners can use `Interner` +/// methods directly. // The `&'static str`s in this type actually point into the arena. // // The `FxHashMap`+`Vec` pair could be replaced by `FxIndexSet`, but #75278 @@ -1475,24 +1608,33 @@ pub struct Interner { } impl Interner { - fn prefill(init: &[&'static str]) -> Self { + /// Prefill the interner table with static symbols. This should only be + /// done for the `SESSION_GLOBALS` interner. + fn prefill_tabled(init: &[&'static str]) -> Self { Interner { strings: init.into(), - names: init.iter().copied().zip((0..).map(Symbol::new)).collect(), + names: init + .iter() + .inspect(|s| assert_eq!(Symbol::try_new_inlined(s), None)) + .copied() + .zip((0..).map(Symbol::new_tabled)) + .collect(), ..Default::default() } } - #[inline] + /// Map a string to its symbol representation, using this interner's table + /// if necessary. 
For the interner in `SESSION_GLOBALS`, `Symbol::intern()` + /// should be used in preference to this function. pub fn intern(&mut self, string: &str) -> Symbol { if let Some(&name) = self.names.get(string) { return name; } - let name = Symbol::new(self.strings.len() as u32); + let name = Symbol::new_tabled(self.strings.len() as u32); - // `from_utf8_unchecked` is safe since we just allocated a `&str` which is known to be - // UTF-8. + // SAFETY: `from_utf8_unchecked` is safe because the input is a copy of + // `string` which is a `&str` and therefore must be UTF-8. let string: &str = unsafe { str::from_utf8_unchecked(self.arena.alloc_slice(string.as_bytes())) }; // It is safe to extend the arena allocation to `'static` because we only access @@ -1503,10 +1645,11 @@ impl Interner { name } - // Get the symbol as a string. `Symbol::as_str()` should be used in - // preference to this function. + // Get the symbol as a string. For the interner in `SESSION_GLOBALS`, + // `Symbol::as_str()` or `Symbol::with()` should be used in preference to + // this function. pub fn get(&self, symbol: Symbol) -> &str { - self.strings[symbol.0.as_usize()] + self.strings[symbol.as_tabled_index()] } } @@ -1548,19 +1691,6 @@ pub mod sym { } impl Symbol { - fn is_used_keyword_2018(self) -> bool { - self >= kw::Async && self <= kw::Dyn - } - - fn is_unused_keyword_2018(self) -> bool { - self == kw::Try - } - - /// Used for sanity checking rustdoc keyword sections. - pub fn is_doc_keyword(self) -> bool { - self <= kw::Union - } - /// A keyword or reserved identifier that can be used as a path segment. pub fn is_path_segment_keyword(self) -> bool { self == kw::Super @@ -1586,20 +1716,20 @@ impl Ident { // Returns `true` for reserved identifiers used internally for elided lifetimes, // unnamed method parameters, crate root module, error recovery etc. 
pub fn is_special(self) -> bool { - self.name <= kw::Underscore + self.name.is_special() } /// Returns `true` if the token is a keyword used in the language. pub fn is_used_keyword(self) -> bool { // Note: `span.edition()` is relatively expensive, don't call it unless necessary. - self.name >= kw::As && self.name <= kw::While + self.name.is_used_keyword_20xx() || self.name.is_used_keyword_2018() && self.span.rust_2018() } /// Returns `true` if the token is a keyword reserved for possible future use. pub fn is_unused_keyword(self) -> bool { // Note: `span.edition()` is relatively expensive, don't call it unless necessary. - self.name >= kw::Abstract && self.name <= kw::Yield + self.name.is_unused_keyword_20xx() || self.name.is_unused_keyword_2018() && self.span.rust_2018() } @@ -1636,15 +1766,20 @@ fn with_interner T>(f: F) -> T { // FIXME: ensure that the interner outlives any thread which uses `SymbolStr`, // by creating a new thread right after constructing the interner. #[derive(Clone, Eq, PartialOrd, Ord)] -pub struct SymbolStr { - string: &'static str, +pub enum SymbolStr { + /// For an inlined symbol this type needs its own bytes, because the + /// original bytes are embedded in the u32 within the `Symbol`. + Inlined { bytes: [u8; 4], len: usize }, + + /// For a tabled symbol a `&str` suffices. + Tabled { string: &'static str }, } // This impl allows a `SymbolStr` to be directly equated with a `String` or // `&str`. impl> std::cmp::PartialEq for SymbolStr { fn eq(&self, other: &T) -> bool { - self.string == other.deref() + self.deref() == other.deref() } } @@ -1660,26 +1795,32 @@ impl std::ops::Deref for SymbolStr { type Target = str; #[inline] fn deref(&self) -> &str { - self.string + match self { + // SAFETY: the bytes originally came from a `&str`. 
+ SymbolStr::Inlined { bytes, len } => unsafe { + std::str::from_utf8_unchecked(&bytes[0..*len]) + }, + SymbolStr::Tabled { string } => string, + } } } impl fmt::Debug for SymbolStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Debug::fmt(self.string, f) + fmt::Debug::fmt(self.deref(), f) } } impl fmt::Display for SymbolStr { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - fmt::Display::fmt(self.string, f) + fmt::Display::fmt(self.deref(), f) } } impl HashStable for SymbolStr { #[inline] fn hash_stable(&self, hcx: &mut CTX, hasher: &mut StableHasher) { - self.string.hash_stable(hcx, hasher) + self.deref().hash_stable(hcx, hasher) } } diff --git a/compiler/rustc_span/src/symbol/tests.rs b/compiler/rustc_span/src/symbol/tests.rs index 47da03424b770..a0cd9da6d3493 100644 --- a/compiler/rustc_span/src/symbol/tests.rs +++ b/compiler/rustc_span/src/symbol/tests.rs @@ -3,23 +3,99 @@ use super::*; use crate::{edition, SessionGlobals}; #[test] -fn interner_tests() { - let mut i: Interner = Interner::default(); - // first one is zero: - assert_eq!(i.intern("dog"), Symbol::new(0)); - // re-use gets the same entry: - assert_eq!(i.intern("dog"), Symbol::new(0)); - // different string gets a different #: - assert_eq!(i.intern("cat"), Symbol::new(1)); - assert_eq!(i.intern("cat"), Symbol::new(1)); - // dog is still at zero - assert_eq!(i.intern("dog"), Symbol::new(0)); +fn symbol_tests() { + let sym = |n| Some(Symbol(SymbolIndex::from_u32(n))); + + // Simple ASCII symbols. 
+ assert_eq!(Symbol::try_new_inlined(""), sym(0x00_00_00_00)); + assert_eq!(Symbol::try_new_inlined("a"), sym(0x00_00_00_61)); + assert_eq!(Symbol::try_new_inlined("ab"), sym(0x00_00_62_61)); + assert_eq!(Symbol::try_new_inlined("abc"), sym(0x00_63_62_61)); + assert_eq!(Symbol::try_new_inlined("abcd"), sym(0x64_63_62_61)); + assert_eq!(Symbol::try_new_inlined("abcde"), None); // too long + assert_eq!(Symbol::try_new_inlined("abcdefghijklmnopqrstuvwxyz"), None); // too long + + // Symbols involving non-ASCII chars. + // Note that the UTF-8 sequence for 'é' is `[0xc3, 0xa9]`. + assert_eq!(Symbol::try_new_inlined("é"), sym(0x00_00_a9_c3)); + assert_eq!(Symbol::try_new_inlined("dé"), sym(0x00_a9_c3_64)); + assert_eq!(Symbol::try_new_inlined("édc"), sym(0x63_64_a9_c3)); + assert_eq!(Symbol::try_new_inlined("cdé"), None); // byte 3 (0xa9) is > 0x7f + + // Symbols involving NUL chars. + assert_eq!(Symbol::try_new_inlined("\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("a\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0a"), sym(0x00_00_61_00)); + assert_eq!(Symbol::try_new_inlined("aa\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0\0a"), sym(0x00_61_00_00)); + assert_eq!(Symbol::try_new_inlined("aaa\0"), None); // last byte is NUL + assert_eq!(Symbol::try_new_inlined("\0\0\0a"), sym(0x61_00_00_00)); + + // Tabled symbols. + assert_eq!(Symbol::new_tabled(0).as_u32(), 0x80000000); + assert_eq!(Symbol::new_tabled(5).as_u32(), 0x80000005); + assert_eq!(Symbol::new_tabled(0x123456).as_u32(), 0x80123456); + + // Tabled symbol indices. 
+ assert_eq!(Symbol::new_tabled(0).as_tabled_index(), 0); + assert_eq!(Symbol::new_tabled(5).as_tabled_index(), 5); + assert_eq!(Symbol::new_tabled(0x123456).as_tabled_index(), 0x123456); } #[test] -fn without_first_quote_test() { +fn symbol_interner_tests() { SESSION_GLOBALS.set(&SessionGlobals::new(edition::DEFAULT_EDITION), || { + let inlined = |s, n, len| { + // Check the symbol and the deinterned string look right. + let sym = Symbol::intern(s); + assert_eq!(sym.as_u32(), n); + sym.with(|w| w == s); + assert_eq!(sym.as_str(), s); + assert_eq!(sym.as_str().len(), len); + }; + + let tabled = |s, len| { + // Check the symbol and the deinterned string look right. + let sym = Symbol::intern(s); + assert!(sym.as_u32() & 0x80000000 != 0); + sym.with(|w| w == s); + assert_eq!(sym.as_str(), s); + assert_eq!(sym.as_str().len(), len); + }; + + // Inlined symbols, lengths 1..=4. + // Note that the UTF-8 sequence for 'é' is `[0xc3, 0xa9]`. + inlined("", 0x00_00_00_00, 0); + inlined("a", 0x00_00_00_61, 1); + inlined("é", 0x00_00_a9_c3, 2); + inlined("dé", 0x00_a9_c3_64, 3); + inlined("édc", 0x63_64_a9_c3, 4); + + // Tabled symbols. + tabled("abcde", 5); // tabled due to length + tabled("cdé", 4); // tabled due to the fourth byte being > 0x7f + tabled("a\0", 2); // tabled due to the last byte being NUL + + // Test `without_first_quote()`. let i = Ident::from_str("'break"); assert_eq!(i.without_first_quote().name, kw::Break); }); } + +#[test] +fn interner_tests() { + let mut i: Interner = Interner::default(); + + // Note that going directly through `Interner` means that no inlined + // symbols are made. + + // First long one is zero. + assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); + // Re-use gets the same entry. + assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); + // Different string gets a different index. + assert_eq!(i.intern("salamander"), Symbol::new_tabled(1)); + assert_eq!(i.intern("salamander"), Symbol::new_tabled(1)); + // Dog is still at zero. 
+ assert_eq!(i.intern("dog"), Symbol::new_tabled(0)); +} diff --git a/src/librustdoc/clean/mod.rs b/src/librustdoc/clean/mod.rs index 14e0fa7eabb00..aa197343ea901 100644 --- a/src/librustdoc/clean/mod.rs +++ b/src/librustdoc/clean/mod.rs @@ -170,11 +170,8 @@ impl Clean for CrateNum { for attr in attrs.lists(sym::doc) { if let Some(v) = attr.value_str() { if attr.has_name(sym::keyword) { - if v.is_doc_keyword() { - keyword = Some(v.to_string()); - break; - } - // FIXME: should warn on unknown keywords? + keyword = Some(v.to_string()); + break; } } }