Skip to content

Commit b9e3ee5

Browse files
committed
Add a very small owned string type for the word index
`TinyBoxedStr` is a small-string optimized replacement for `Box<str>` styled after <https://cedardb.com/blog/german_strings/>. A nearly identical type is used in helix-editor/spellbook for space savings. This type is specialized for its use-case in two ways: (1) strings are immutable after creation, and (2) strings are very, very small (less than 256 bytes long). Because of these attributes we can use nearly the full size of the type for the inline representation and also keep a small total size. Many other small string crates in the wild are 3 `usize`s long (same as a regular `String`) to support mutability. This type is more like a `Box<str>` (2 `usize`s long). Mostly, though, I like this small string type because it's very straightforward to implement. Other than a few functions that reach into `std::alloc` or `std::ptr`, the code is short and boring.
1 parent 38e1e27 commit b9e3ee5

File tree

3 files changed

+222
-5
lines changed

3 files changed

+222
-5
lines changed

helix-stdx/src/lib.rs

+1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@ pub mod faccess;
33
pub mod path;
44
pub mod range;
55
pub mod rope;
6+
pub mod str;
67

78
pub use range::Range;

helix-stdx/src/str.rs

+209
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
use std::{
2+
alloc,
3+
borrow::Borrow,
4+
fmt, hash,
5+
mem::{size_of, ManuallyDrop},
6+
ptr::{self, NonNull},
7+
slice, str,
8+
};
9+
10+
/// A very very small owned string type.
///
/// This type is like a `Box<str>` but can only fit strings with a byte length smaller than 256
/// (see [`TinyBoxedStr::MAX_LEN`]). It is always two `usize`s wide. The first byte stores the
/// length. On 64-bit machines strings of up to 15 bytes (7 bytes on 32-bit machines) are stored
/// inline in the remaining bytes. Longer strings keep a copy of their leading bytes in `prefix`
/// and use the second `usize` of memory as a thin pointer to a heap allocation that holds the
/// whole string.
///
/// Strings are immutable after creation. Build one with `TryFrom<&str>`, which fails with
/// [`TooLongError`] when the input does not fit.
///
/// Unlike `Box<str>` this type is not null-pointer optimized.
#[repr(C)]
pub struct TinyBoxedStr {
    /// Byte length of the string. Also selects the representation: values up to `INLINE_LEN`
    /// mean the string is stored inline, larger values mean it is stored on the heap.
    len: u8,
    /// The first `PREFIX_LEN` bytes of the string, zero padded when the string is shorter.
    prefix: [u8; Self::PREFIX_LEN],
    /// The rest of an inline string, or the pointer to the heap copy of a longer string.
    trailing: TinyStrTrailing,
}

/// The second word of a [`TinyBoxedStr`].
///
/// Which field is active is decided by the owning string's `len`: `suffix` when the string is
/// stored inline and `ptr` when it lives on the heap.
#[repr(C)]
union TinyStrTrailing {
    suffix: [u8; TinyBoxedStr::SUFFIX_LEN],
    ptr: ManuallyDrop<NonNull<u8>>,
}

// The inline representation reads `prefix` and `trailing.suffix` as one contiguous buffer. The
// fields add up to exactly two words, so this size check proves there is no padding between them.
const _: () = assert!(size_of::<TinyBoxedStr>() == 2 * size_of::<usize>());

impl TinyBoxedStr {
    /// Number of string bytes that fit in the first word next to the length byte.
    const PREFIX_LEN: usize = size_of::<usize>() - size_of::<u8>();
    /// Number of string bytes that fit in the second word.
    const SUFFIX_LEN: usize = size_of::<usize>();
    /// Longest string (in bytes) that is stored without a heap allocation.
    const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8;

    /// The maximum byte length of a string this type can hold.
    pub const MAX_LEN: usize = u8::MAX as usize;

    /// Returns the length of the string in bytes.
    #[inline]
    pub fn len(&self) -> usize {
        self.len as usize
    }

    /// Returns `true` if the string has a length of zero bytes.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Returns `true` when the content lives inside `self` rather than on the heap.
    #[inline]
    fn is_inline(&self) -> bool {
        self.len <= Self::INLINE_LEN
    }

    /// Returns the content of the string as a byte slice.
    pub fn as_bytes(&self) -> &[u8] {
        let data: *const u8 = if self.is_inline() {
            // Derive the pointer from `self` as a whole rather than from `self.prefix` so that
            // it may also be used to read the bytes stored in `self.trailing.suffix`.
            let this = ptr::from_ref(self);
            // SAFETY: `this` comes from a valid reference, so projecting to a field is sound.
            unsafe { ptr::addr_of!((*this).prefix) }.cast()
        } else {
            // SAFETY: strings longer than `INLINE_LEN` are always built with `ptr` as the
            // active union field.
            unsafe { self.trailing.ptr }.as_ptr()
        };
        // SAFETY: in the inline case `prefix` and `suffix` are contiguous, fully initialized and
        // at least `len` bytes long in total. In the heap case the allocation made by
        // `copy_bytes` is exactly `len` initialized bytes and lives until `self` is dropped.
        unsafe { slice::from_raw_parts(data, self.len()) }
    }

    /// Returns the content of the string as a `&str`.
    #[inline]
    pub fn as_str(&self) -> &str {
        // SAFETY: the bytes are always a complete copy of a `&str` and are never mutated.
        unsafe { str::from_utf8_unchecked(self.as_bytes()) }
    }

    /// The layout of a heap allocation holding `len` bytes of string content.
    fn layout(len: usize) -> alloc::Layout {
        alloc::Layout::array::<u8>(len)
            .expect("a valid layout for an array")
            .pad_to_align()
    }

    /// Copies `source` into a fresh heap allocation described by `Self::layout(source.len())`.
    ///
    /// `source` must not be empty since allocating a zero-sized layout is undefined behavior.
    /// All callers only pass content longer than `INLINE_LEN`.
    fn copy_bytes(source: &[u8]) -> NonNull<u8> {
        debug_assert!(!source.is_empty(), "inline strings must not be allocated");
        let layout = Self::layout(source.len());
        // SAFETY: `layout` has a non-zero size because `source` is not empty.
        let nullable = unsafe { alloc::alloc(layout) };
        let Some(dest) = NonNull::new(nullable) else {
            alloc::handle_alloc_error(layout)
        };
        // SAFETY: `source` is valid for reads of `source.len()` bytes, `dest` was just allocated
        // with room for that many bytes and a fresh allocation cannot overlap `source`.
        unsafe {
            ptr::copy_nonoverlapping(source.as_ptr(), dest.as_ptr(), source.len());
        }
        dest
    }
}

/// The error returned when converting a string longer than [`TinyBoxedStr::MAX_LEN`] bytes into
/// a [`TinyBoxedStr`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TooLongError;

impl fmt::Display for TooLongError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "string is longer than {} bytes", TinyBoxedStr::MAX_LEN)
    }
}

impl std::error::Error for TooLongError {}

impl TryFrom<&str> for TinyBoxedStr {
    type Error = TooLongError;

    /// Copies `s` into a new `TinyBoxedStr`, allocating only when `s` does not fit inline.
    ///
    /// Returns [`TooLongError`] when `s` is longer than [`TinyBoxedStr::MAX_LEN`] bytes.
    fn try_from(s: &str) -> Result<Self, Self::Error> {
        // `MAX_LEN` is `u8::MAX`, so the length fits in a `u8` exactly when it is short enough.
        let Ok(len) = u8::try_from(s.len()) else {
            return Err(TooLongError);
        };

        let bytes = s.as_bytes();
        let mut prefix = [0; Self::PREFIX_LEN];
        let trailing = if len <= Self::INLINE_LEN {
            // Spread the content over the prefix and suffix. Unused bytes stay zeroed so that
            // both arrays are always fully initialized.
            let mut suffix = [0; Self::SUFFIX_LEN];
            let (head, tail) = bytes.split_at(bytes.len().min(Self::PREFIX_LEN));
            prefix[..head.len()].copy_from_slice(head);
            suffix[..tail.len()].copy_from_slice(tail);
            TinyStrTrailing { suffix }
        } else {
            // The heap copy holds the whole string. The prefix is an additional inline copy of
            // its first bytes.
            prefix.copy_from_slice(&bytes[..Self::PREFIX_LEN]);
            let ptr = ManuallyDrop::new(Self::copy_bytes(bytes));
            TinyStrTrailing { ptr }
        };

        Ok(Self {
            len,
            prefix,
            trailing,
        })
    }
}

impl Drop for TinyBoxedStr {
    fn drop(&mut self) {
        if !self.is_inline() {
            // SAFETY: strings longer than `INLINE_LEN` always have `ptr` as the active field.
            let ptr = unsafe { self.trailing.ptr }.as_ptr();
            let layout = Self::layout(self.len());
            // SAFETY: `ptr` was allocated by `copy_bytes` with this same layout (`len` never
            // changes after creation) and is owned exclusively by `self`.
            unsafe { alloc::dealloc(ptr, layout) }
        }
    }
}

impl Clone for TinyBoxedStr {
    fn clone(&self) -> Self {
        let trailing = if self.is_inline() {
            // SAFETY: inline strings always have a fully initialized `suffix` as the active
            // field.
            let suffix = unsafe { self.trailing.suffix };
            TinyStrTrailing { suffix }
        } else {
            // Heap strings need their own allocation so that each clone frees its own copy.
            let ptr = ManuallyDrop::new(Self::copy_bytes(self.as_bytes()));
            TinyStrTrailing { ptr }
        };

        Self {
            len: self.len,
            prefix: self.prefix,
            trailing,
        }
    }
}

impl Default for TinyBoxedStr {
    /// Creates an empty string. This does not allocate.
    fn default() -> Self {
        Self {
            len: 0,
            prefix: [0; Self::PREFIX_LEN],
            trailing: TinyStrTrailing {
                suffix: [0; Self::SUFFIX_LEN],
            },
        }
    }
}

impl AsRef<str> for TinyBoxedStr {
    fn as_ref(&self) -> &str {
        self.as_str()
    }
}

// `Eq` and `Hash` below defer to `str` so that they agree with this `Borrow` implementation.
impl Borrow<str> for TinyBoxedStr {
    fn borrow(&self) -> &str {
        self.as_str()
    }
}

// NOTE: this could be specialized to optimize the number of comparison operations. We could cast
// the first `usize` of memory together to do a single comparison (and same for the suffixes).
// This optimization would only matter if we compared these strings very frequently however.
impl PartialEq for TinyBoxedStr {
    fn eq(&self, other: &Self) -> bool {
        self.as_str() == other.as_str()
    }
}

impl Eq for TinyBoxedStr {}

impl PartialEq<str> for TinyBoxedStr {
    fn eq(&self, other: &str) -> bool {
        self.as_str() == other
    }
}

impl hash::Hash for TinyBoxedStr {
    fn hash<H: hash::Hasher>(&self, state: &mut H) {
        hash::Hash::hash(self.as_str(), state)
    }
}

impl fmt::Debug for TinyBoxedStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Debug::fmt(self.as_str(), f)
    }
}

impl fmt::Display for TinyBoxedStr {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        fmt::Display::fmt(self.as_str(), f)
    }
}

// SAFETY: a `TinyBoxedStr` exclusively owns its heap allocation (if any), so it can be moved to
// another thread.
unsafe impl Send for TinyBoxedStr {}
// SAFETY: the content is immutable after creation and there is no interior mutability, so
// shared references only ever read.
unsafe impl Sync for TinyBoxedStr {}

helix-view/src/handlers/word_index.rs

+12-5
Original file line numberDiff line numberDiff line change
@@ -123,8 +123,7 @@ const MIN_WORD_GRAPHEMES: usize = 3;
123123
/// Maximum word length allowed (in chars)
124124
const MAX_WORD_LEN: usize = 50;
125125

126-
// TODO: choose or create a suitable small string type.
127-
type Word = String;
126+
type Word = helix_stdx::str::TinyBoxedStr;
128127

129128
#[derive(Debug, Default)]
130129
struct WordIndexInner {
@@ -142,11 +141,16 @@ impl WordIndexInner {
142141
}
143142

144143
fn insert(&mut self, word: RopeSlice) {
144+
assert!(word.len_chars() <= MAX_WORD_LEN);
145+
// The word must be shorter than `TinyBoxedStr::MAX_LEN` because it is at most 50
146+
// characters and characters take at most four bytes.
147+
assert!(word.len_bytes() < Word::MAX_LEN);
148+
145149
let word: Cow<str> = word.into();
146150
if let Some(rc) = self.words.get_mut(word.as_ref()) {
147151
*rc = rc.saturating_add(1);
148152
} else {
149-
self.words.insert(word.into_owned(), 1);
153+
self.words.insert(word.as_ref().try_into().unwrap(), 1);
150154
}
151155
}
152156

@@ -172,7 +176,10 @@ impl WordIndex {
172176
let inner = self.inner.read();
173177
let mut matches = fuzzy_match(pattern, inner.words(), false);
174178
matches.sort_unstable_by_key(|(_, score)| *score);
175-
matches.into_iter().map(|(word, _)| word.clone()).collect()
179+
matches
180+
.into_iter()
181+
.map(|(word, _)| word.to_string())
182+
.collect()
176183
}
177184

178185
fn add_document(&self, text: &Rope) {
@@ -409,7 +416,7 @@ mod tests {
409416
impl WordIndex {
410417
fn words(&self) -> HashSet<String> {
411418
let inner = self.inner.read();
412-
inner.words().cloned().collect()
419+
inner.words().map(|w| w.to_string()).collect()
413420
}
414421
}
415422

0 commit comments

Comments
 (0)