From f4863ec69cabbe0eda815c17f7d787365c8738d2 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 25 Mar 2025 12:43:53 -0400 Subject: [PATCH 1/2] Complete words from open buffers --- book/src/editor.md | 19 + book/src/languages.md | 1 + helix-core/src/completion.rs | 1 + helix-core/src/syntax/config.rs | 10 + helix-core/src/transaction.rs | 11 + helix-term/src/handlers.rs | 4 +- helix-term/src/handlers/completion.rs | 4 +- helix-term/src/handlers/completion/request.rs | 9 +- helix-term/src/handlers/completion/word.rs | 134 +++++ helix-view/src/document.rs | 6 + helix-view/src/editor.rs | 22 +- helix-view/src/handlers.rs | 7 + helix-view/src/handlers/word_index.rs | 477 ++++++++++++++++++ 13 files changed, 701 insertions(+), 4 deletions(-) create mode 100644 helix-term/src/handlers/completion/word.rs create mode 100644 helix-view/src/handlers/word_index.rs diff --git a/book/src/editor.md b/book/src/editor.md index b79792058b80..ce04d0a01829 100644 --- a/book/src/editor.md +++ b/book/src/editor.md @@ -19,6 +19,7 @@ - [`[editor.soft-wrap]` Section](#editorsoft-wrap-section) - [`[editor.smart-tab]` Section](#editorsmart-tab-section) - [`[editor.inline-diagnostics]` Section](#editorinline-diagnostics-section) +- [`[editor.word-completion]` Section](#editorword-completion-section) ### `[editor]` Section @@ -474,3 +475,21 @@ end-of-line-diagnostics = "hint" [editor.inline-diagnostics] cursor-line = "warning" # show warnings and errors on the cursorline inline ``` + +### `[editor.word-completion]` Section + +Options for controlling completion of words from open buffers. 
+ +| Key | Description | Default | +| --- | --- | --- | +| `enable` | Whether word completion is enabled | `true` | +| `trigger-length` | Number of word characters to type before triggering completion | `7` | + +Example: + +```toml +[editor.word-completion] +enable = true +# Set the trigger length lower so that words are completed more often +trigger-length = 4 +``` diff --git a/book/src/languages.md b/book/src/languages.md index ea18e9c39efa..cf2f1c23e0ab 100644 --- a/book/src/languages.md +++ b/book/src/languages.md @@ -71,6 +71,7 @@ These configuration keys are available: | `text-width` | Maximum line length. Used for the `:reflow` command and soft-wrapping if `soft-wrap.wrap-at-text-width` is set, defaults to `editor.text-width` | | `rulers` | Overrides the `editor.rulers` config key for the language. | | `path-completion` | Overrides the `editor.path-completion` config key for the language. | +| `word-completion` | Overrides the [`editor.word-completion`](./editor.md#editorword-completion-section) configuration for the language. | | `workspace-lsp-roots` | Directories relative to the workspace root that are treated as LSP roots. Should only be set in `.helix/config.toml`. Overwrites the setting of the same name in `config.toml` if set. | | `persistent-diagnostic-sources` | An array of LSP diagnostic sources assumed unchanged when the language server resends the same set of diagnostics. Helix can track the position for these diagnostics internally instead. Useful for diagnostics that are recomputed on save. 
diff --git a/helix-core/src/completion.rs b/helix-core/src/completion.rs index 29c86b7380a2..11b49cfc99ff 100644 --- a/helix-core/src/completion.rs +++ b/helix-core/src/completion.rs @@ -16,6 +16,7 @@ pub struct CompletionItem { pub enum CompletionProvider { Lsp(LanguageServerId), Path, + Word, } impl From for CompletionProvider { diff --git a/helix-core/src/syntax/config.rs b/helix-core/src/syntax/config.rs index 432611bb0d38..ddff26f17f5a 100644 --- a/helix-core/src/syntax/config.rs +++ b/helix-core/src/syntax/config.rs @@ -7,6 +7,7 @@ use serde::{ser::SerializeSeq as _, Deserialize, Serialize}; use std::{ collections::{HashMap, HashSet}, fmt::{self, Display}, + num::NonZeroU8, path::PathBuf, str::FromStr, }; @@ -60,6 +61,8 @@ pub struct LanguageConfiguration { /// If set, overrides `editor.path-completion`. pub path_completion: Option, + /// If set, overrides `editor.word-completion`. + pub word_completion: Option, #[serde(default)] pub diagnostic_severity: Severity, @@ -572,6 +575,13 @@ pub struct SoftWrap { pub wrap_at_text_width: Option, } +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default, rename_all = "kebab-case", deny_unknown_fields)] +pub struct WordCompletion { + pub enable: Option, + pub trigger_length: Option, +} + fn deserialize_regex<'de, D>(deserializer: D) -> Result, D::Error> where D: serde::Deserializer<'de>, diff --git a/helix-core/src/transaction.rs b/helix-core/src/transaction.rs index 450b47365ecb..83945f765f0e 100644 --- a/helix-core/src/transaction.rs +++ b/helix-core/src/transaction.rs @@ -19,6 +19,17 @@ pub enum Operation { Insert(Tendril), } +impl Operation { + /// The number of characters affected by the operation. 
+ #[allow(clippy::len_without_is_empty)] + pub fn len(&self) -> usize { + match self { + Self::Retain(n) | Self::Delete(n) => *n, + Self::Insert(s) => s.chars().count(), + } + } +} + #[derive(Debug, Copy, Clone, PartialEq, Eq)] pub enum Assoc { Before, diff --git a/helix-term/src/handlers.rs b/helix-term/src/handlers.rs index c7d71526c802..9c46a6503f54 100644 --- a/helix-term/src/handlers.rs +++ b/helix-term/src/handlers.rs @@ -8,7 +8,7 @@ use crate::events; use crate::handlers::auto_save::AutoSaveHandler; use crate::handlers::signature_help::SignatureHelpHandler; -pub use helix_view::handlers::Handlers; +pub use helix_view::handlers::{word_index, Handlers}; use self::document_colors::DocumentColorsHandler; @@ -26,12 +26,14 @@ pub fn setup(config: Arc>) -> Handlers { let signature_hints = SignatureHelpHandler::new().spawn(); let auto_save = AutoSaveHandler::new().spawn(); let document_colors = DocumentColorsHandler::default().spawn(); + let word_index = word_index::Handler::spawn(); let handlers = Handlers { completions: helix_view::handlers::completion::CompletionHandler::new(event_tx), signature_hints, auto_save, document_colors, + word_index, }; helix_view::handlers::register_hooks(&handlers); diff --git a/helix-term/src/handlers/completion.rs b/helix-term/src/handlers/completion.rs index 5017399bd078..22bb0ce08175 100644 --- a/helix-term/src/handlers/completion.rs +++ b/helix-term/src/handlers/completion.rs @@ -30,6 +30,7 @@ mod item; mod path; mod request; mod resolve; +mod word; async fn handle_response( requests: &mut JoinSet, @@ -82,7 +83,7 @@ async fn replace_completions( fn show_completion( editor: &mut Editor, compositor: &mut Compositor, - items: Vec, + mut items: Vec, context: HashMap, trigger: Trigger, ) { @@ -101,6 +102,7 @@ fn show_completion( if ui.completion.is_some() { return; } + word::retain_valid_completions(trigger, doc, view.id, &mut items); editor.handlers.completions.active_completions = context; let completion_area = 
ui.set_completion(editor, items, trigger.pos, size); diff --git a/helix-term/src/handlers/completion/request.rs b/helix-term/src/handlers/completion/request.rs index 51a3129a8498..29cd8e42c322 100644 --- a/helix-term/src/handlers/completion/request.rs +++ b/helix-term/src/handlers/completion/request.rs @@ -28,6 +28,8 @@ use crate::job::{dispatch, dispatch_blocking}; use crate::ui; use crate::ui::editor::InsertEvent; +use super::word; + #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(super) enum TriggerKind { Auto, @@ -242,10 +244,15 @@ fn request_completions( doc.selection(view.id).clone(), doc, handle.clone(), - savepoint, + savepoint.clone(), ) { requests.spawn_blocking(path_completion_request); } + if let Some(word_completion_request) = + word::completion(editor, trigger, handle.clone(), savepoint) + { + requests.spawn_blocking(word_completion_request); + } let ui = compositor.find::().unwrap(); ui.last_insert.1.push(InsertEvent::RequestCompletion); diff --git a/helix-term/src/handlers/completion/word.rs b/helix-term/src/handlers/completion/word.rs new file mode 100644 index 000000000000..aa204bf8f73d --- /dev/null +++ b/helix-term/src/handlers/completion/word.rs @@ -0,0 +1,134 @@ +use std::{borrow::Cow, sync::Arc}; + +use helix_core::{ + self as core, chars::char_is_word, completion::CompletionProvider, movement, Transaction, +}; +use helix_event::TaskHandle; +use helix_stdx::rope::RopeSliceExt as _; +use helix_view::{ + document::SavePoint, handlers::completion::ResponseContext, Document, Editor, ViewId, +}; + +use super::{request::TriggerKind, CompletionItem, CompletionItems, CompletionResponse, Trigger}; + +const COMPLETION_KIND: &str = "word"; + +pub(super) fn completion( + editor: &Editor, + trigger: Trigger, + handle: TaskHandle, + savepoint: Arc, +) -> Option CompletionResponse> { + if !doc!(editor).word_completion_enabled() { + return None; + } + let config = editor.config().word_completion; + let doc_config = doc!(editor) + .language_config() + 
.and_then(|config| config.word_completion); + let trigger_length = doc_config + .and_then(|c| c.trigger_length) + .unwrap_or(config.trigger_length) + .get() as usize; + + let (view, doc) = current_ref!(editor); + let rope = doc.text().clone(); + let word_index = editor.handlers.word_index().clone(); + let text = doc.text().slice(..); + let selection = doc.selection(view.id).clone(); + let pos = selection.primary().cursor(text); + + let cursor = movement::move_prev_word_start(text, core::Range::point(pos), 1); + if cursor.head == pos { + return None; + } + if trigger.kind != TriggerKind::Manual + && text + .slice(cursor.head..) + .graphemes() + .take(trigger_length) + .take_while(|g| g.chars().all(char_is_word)) + .count() + != trigger_length + { + return None; + } + + let typed_word_range = cursor.head..pos; + let typed_word = text.slice(typed_word_range.clone()); + let edit_diff = if typed_word + .char(typed_word.len_chars().saturating_sub(1)) + .is_whitespace() + { + 0 + } else { + typed_word.len_chars() + }; + + if handle.is_canceled() { + return None; + } + + let future = move || { + let text = rope.slice(..); + let typed_word: Cow<_> = text.slice(typed_word_range).into(); + let items = word_index + .matches(&typed_word) + .into_iter() + .filter(|word| word.as_str() != typed_word.as_ref()) + .map(|word| { + let transaction = Transaction::change_by_selection(&rope, &selection, |range| { + let cursor = range.cursor(text); + (cursor - edit_diff, cursor, Some((&word).into())) + }); + CompletionItem::Other(core::CompletionItem { + transaction, + label: word.into(), + kind: Cow::Borrowed(COMPLETION_KIND), + documentation: None, + provider: CompletionProvider::Word, + }) + }) + .collect(); + + CompletionResponse { + items: CompletionItems::Other(items), + provider: CompletionProvider::Word, + context: ResponseContext { + is_incomplete: false, + priority: 0, + savepoint, + }, + } + }; + + Some(future) +} + +pub(super) fn retain_valid_completions( + trigger: Trigger, + 
doc: &Document, + view_id: ViewId, + items: &mut Vec, +) { + if trigger.kind == TriggerKind::Manual { + return; + } + + let text = doc.text().slice(..); + let cursor = doc.selection(view_id).primary().cursor(text); + if text + .get_char(cursor.saturating_sub(1)) + .is_some_and(|ch| ch.is_whitespace()) + { + items.retain(|item| { + !matches!( + item, + CompletionItem::Other(core::CompletionItem { + provider: CompletionProvider::Word, + .. + }) + ) + }); + } +} diff --git a/helix-view/src/document.rs b/helix-view/src/document.rs index fb89e2e0ce9f..18c68948dd5c 100644 --- a/helix-view/src/document.rs +++ b/helix-view/src/document.rs @@ -1809,6 +1809,12 @@ impl Document { self.version } + pub fn word_completion_enabled(&self) -> bool { + self.language_config() + .and_then(|lang_config| lang_config.word_completion.and_then(|c| c.enable)) + .unwrap_or_else(|| self.config.load().word_completion.enable) + } + pub fn path_completion_enabled(&self) -> bool { self.language_config() .and_then(|lang_config| lang_config.path_completion) diff --git a/helix-view/src/editor.rs b/helix-view/src/editor.rs index 9aa073fcf516..6441f236f226 100644 --- a/helix-view/src/editor.rs +++ b/helix-view/src/editor.rs @@ -29,7 +29,7 @@ use std::{ collections::{BTreeMap, HashMap, HashSet}, fs, io::{self, stdin}, - num::NonZeroUsize, + num::{NonZeroU8, NonZeroUsize}, path::{Path, PathBuf}, pin::Pin, sync::Arc, @@ -279,6 +279,9 @@ pub struct Config { /// either absolute or relative to the current opened document or current working directory (if the buffer is not yet saved). /// Defaults to true. pub path_completion: bool, + /// Configures completion of words from open buffers. + /// Defaults to enabled with a trigger length of 7. + pub word_completion: WordCompletion, /// Automatic formatting on save. Defaults to true. pub auto_format: bool, /// Default register used for yank/paste. 
Defaults to '"' @@ -964,6 +967,22 @@ pub enum PopupBorderConfig { Menu, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(default, rename_all = "kebab-case", deny_unknown_fields)] +pub struct WordCompletion { + pub enable: bool, + pub trigger_length: NonZeroU8, +} + +impl Default for WordCompletion { + fn default() -> Self { + Self { + enable: true, + trigger_length: NonZeroU8::new(7).unwrap(), + } + } +} + impl Default for Config { fn default() -> Self { Self { @@ -983,6 +1002,7 @@ impl Default for Config { auto_pairs: AutoPairConfig::default(), auto_completion: true, path_completion: true, + word_completion: WordCompletion::default(), auto_format: true, default_yank_register: '"', auto_save: AutoSave::default(), diff --git a/helix-view/src/handlers.rs b/helix-view/src/handlers.rs index 258ed89e5cec..6aba17d6dbe1 100644 --- a/helix-view/src/handlers.rs +++ b/helix-view/src/handlers.rs @@ -9,6 +9,7 @@ pub mod completion; pub mod dap; pub mod diagnostics; pub mod lsp; +pub mod word_index; #[derive(Debug)] pub enum AutoSaveEvent { @@ -22,6 +23,7 @@ pub struct Handlers { pub signature_hints: Sender, pub auto_save: Sender, pub document_colors: Sender, + pub word_index: word_index::Handler, } impl Handlers { @@ -46,8 +48,13 @@ impl Handlers { }; send_blocking(&self.signature_hints, event) } + + pub fn word_index(&self) -> &word_index::WordIndex { + &self.word_index.index + } } pub fn register_hooks(handlers: &Handlers) { lsp::register_hooks(handlers); + word_index::register_hooks(handlers); } diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs new file mode 100644 index 000000000000..f84e77e28fda --- /dev/null +++ b/helix-view/src/handlers/word_index.rs @@ -0,0 +1,477 @@ +//! Indexing of words from open buffers. +//! +//! This provides an eventually consistent set of words used in any open buffers. This set is +//! later used for lexical completion. 
+ +use std::{borrow::Cow, collections::HashMap, iter, mem, sync::Arc, time::Duration}; + +use helix_core::{ + chars::char_is_word, fuzzy::fuzzy_match, movement, ChangeSet, Range, Rope, RopeSlice, +}; +use helix_event::{register_hook, AsyncHook}; +use helix_stdx::rope::RopeSliceExt as _; +use parking_lot::RwLock; +use tokio::{sync::mpsc, time::Instant}; + +use crate::{ + events::{DocumentDidChange, DocumentDidClose, DocumentDidOpen}, + DocumentId, +}; + +use super::Handlers; + +#[derive(Debug)] +struct Change { + old_text: Rope, + text: Rope, + changes: ChangeSet, +} + +#[derive(Debug)] +enum Event { + Insert(Rope), + Update(DocumentId, Change), + Delete(DocumentId, Rope), +} + +#[derive(Debug)] +pub struct Handler { + pub(super) index: WordIndex, + /// A sender into an async hook which debounces updates to the index. + hook: mpsc::Sender, + /// A sender to a tokio task which coordinates the indexing of documents. + /// + /// See [WordIndex::run]. A supervisor-like task is in charge of spawning tasks to update the + /// index. This ensures that consecutive edits to a document trigger the correct order of + /// insertions and deletions into the word set. 
+ coordinator: mpsc::UnboundedSender, +} + +impl Handler { + pub fn spawn() -> Self { + let index = WordIndex::default(); + let (tx, rx) = mpsc::unbounded_channel(); + tokio::spawn(index.clone().run(rx)); + Self { + hook: Hook { + changes: HashMap::default(), + coordinator: tx.clone(), + } + .spawn(), + index, + coordinator: tx, + } + } +} + +#[derive(Debug)] +struct Hook { + changes: HashMap, + coordinator: mpsc::UnboundedSender, +} + +const DEBOUNCE: Duration = Duration::from_secs(1); + +impl AsyncHook for Hook { + type Event = Event; + + fn handle_event(&mut self, event: Self::Event, timeout: Option) -> Option { + match event { + Event::Insert(_) => unreachable!("inserts are sent to the worker directly"), + Event::Update(doc, change) => { + if let Some(pending_change) = self.changes.get_mut(&doc) { + // If there is already a change waiting for this document, merge the two + // changes together by composing the changesets and saving the new `text`. + pending_change.changes = + mem::take(&mut pending_change.changes).compose(change.changes); + pending_change.text = change.text; + Some(Instant::now() + DEBOUNCE) + } else if !is_changeset_significant(&change.changes) { + // If the changeset is fairly large, debounce before updating the index. + self.changes.insert(doc, change); + Some(Instant::now() + DEBOUNCE) + } else { + // Otherwise if the change is small, queue the update to the index immediately. + self.coordinator.send(Event::Update(doc, change)).unwrap(); + timeout + } + } + Event::Delete(doc, text) => { + // If there are pending changes that haven't been indexed since the last debounce, + // forget them and delete the old text. 
+ if let Some(change) = self.changes.remove(&doc) { + self.coordinator + .send(Event::Delete(doc, change.old_text)) + .unwrap(); + } else { + self.coordinator.send(Event::Delete(doc, text)).unwrap(); + } + timeout + } + } + } + + fn finish_debounce(&mut self) { + for (doc, change) in self.changes.drain() { + self.coordinator.send(Event::Update(doc, change)).unwrap(); + } + } +} + +/// Minimum number of grapheme clusters required to include a word in the index +const MIN_WORD_GRAPHEMES: usize = 3; +/// Maximum word length allowed (in chars) +const MAX_WORD_LEN: usize = 50; + +// TODO: choose or create a suitable small string type. +type Word = String; + +#[derive(Debug, Default)] +struct WordIndexInner { + /// Reference counted storage for words. + /// + /// Words are very likely to be reused many times. Instead of storing duplicates we keep a + /// reference count of times a word is used. When the reference count drops to zero the word + /// is removed from the index. + words: HashMap, +} + +impl WordIndexInner { + fn words(&self) -> impl Iterator { + self.words.keys() + } + + fn insert(&mut self, word: RopeSlice) { + let word: Cow = word.into(); + if let Some(rc) = self.words.get_mut(word.as_ref()) { + *rc = rc.saturating_add(1); + } else { + self.words.insert(word.into_owned(), 1); + } + } + + fn remove(&mut self, word: RopeSlice) { + let word: Cow = word.into(); + match self.words.get_mut(word.as_ref()) { + Some(1) => { + self.words.remove(word.as_ref()); + } + Some(n) => *n -= 1, + None => (), + } + } +} + +#[derive(Debug, Default, Clone)] +pub struct WordIndex { + inner: Arc>, +} + +impl WordIndex { + pub fn matches(&self, pattern: &str) -> Vec { + let inner = self.inner.read(); + let mut matches = fuzzy_match(pattern, inner.words(), false); + matches.sort_unstable_by_key(|(_, score)| *score); + matches.into_iter().map(|(word, _)| word.clone()).collect() + } + + fn add_document(&self, text: &Rope) { + let words: Vec<_> = words(text.slice(..)).collect(); + let 
mut inner = self.inner.write(); + for word in words { + inner.insert(word); + } + } + + fn update_document(&self, old_text: &Rope, text: &Rope, changes: &ChangeSet) { + let mut inserted = Vec::new(); + let mut removed = Vec::new(); + for (old_window, new_window) in changed_windows(old_text.slice(..), text.slice(..), changes) + { + inserted.extend(words(new_window)); + removed.extend(words(old_window)); + } + + let mut inner = self.inner.write(); + for word in inserted { + inner.insert(word); + } + for word in removed { + inner.remove(word); + } + } + + fn remove_document(&self, text: &Rope) { + let words: Vec<_> = words(text.slice(..)).collect(); + let mut inner = self.inner.write(); + for word in words { + inner.remove(word); + } + } + + /// Coordinate the indexing of documents. + /// + /// This task wraps a MPSC queue and spawns blocking tasks which update the index. Updates + /// are applied one-by-one to ensure that changes to the index are **serialized**: + /// updates to each document must be applied in-order. + async fn run(self, mut events: mpsc::UnboundedReceiver) { + while let Some(event) = events.recv().await { + let this = self.clone(); + tokio::task::spawn_blocking(move || match event { + Event::Insert(text) => { + this.add_document(&text); + } + Event::Update( + _doc, + Change { + old_text, + text, + changes, + .. 
+ }, + ) => { + this.update_document(&old_text, &text, &changes); + } + Event::Delete(_doc, text) => { + this.remove_document(&text); + } + }) + .await + .unwrap(); + } + } +} + +fn words(text: RopeSlice) -> impl Iterator { + let mut cursor = Range::point(0); + if text + .get_char(cursor.anchor) + .is_some_and(|ch| !ch.is_whitespace()) + { + let cursor_word_end = movement::move_next_word_end(text, cursor, 1); + if cursor_word_end.anchor == 0 { + cursor = cursor_word_end; + } + } + + iter::from_fn(move || { + while cursor.head <= text.len_chars() { + let mut word = None; + if text + .slice(..cursor.head) + .graphemes_rev() + .take(MIN_WORD_GRAPHEMES) + .take_while(|g| g.chars().all(char_is_word)) + .count() + == MIN_WORD_GRAPHEMES + { + cursor.anchor += text + .chars_at(cursor.anchor) + .take_while(|&c| !char_is_word(c)) + .count(); + let slice = cursor.slice(text); + if slice.len_chars() <= MAX_WORD_LEN { + word = Some(slice); + } + } + let head = cursor.head; + cursor = movement::move_next_word_end(text, cursor, 1); + if cursor.head == head { + cursor.head = usize::MAX; + } + if word.is_some() { + return word; + } + } + None + }) +} + +/// Finds areas of the old and new texts around each operation in `changes`. +/// +/// The window is larger than the changed area and can encompass multiple insert/delete operations +/// if they are grouped closely together. +/// +/// The ranges of the old and new text should usually be of different sizes. For example a +/// deletion of "foo" surrounded by large retain sections would give a longer window into the +/// `old_text` and shorter window of `new_text`. Vice-versa for an insertion. A full replacement +/// of a word though would give two slices of the same size. 
+fn changed_windows<'a>( + old_text: RopeSlice<'a>, + new_text: RopeSlice<'a>, + changes: &'a ChangeSet, +) -> impl Iterator, RopeSlice<'a>)> { + use helix_core::Operation::*; + + let mut operations = changes.changes().iter().peekable(); + let mut old_pos = 0; + let mut new_pos = 0; + iter::from_fn(move || loop { + let operation = operations.next()?; + let old_start = old_pos; + let new_start = new_pos; + let len = operation.len(); + match operation { + Retain(_) => { + old_pos += len; + new_pos += len; + continue; + } + Insert(_) => new_pos += len, + Delete(_) => old_pos += len, + } + + // Scan ahead until a `Retain` is found which would end a window. + while let Some(o) = operations.next_if(|op| !matches!(op, Retain(n) if *n > MAX_WORD_LEN)) { + let len = o.len(); + match o { + Retain(_) => { + old_pos += len; + new_pos += len; + } + Delete(_) => old_pos += len, + Insert(_) => new_pos += len, + } + } + + let old_window = old_start.saturating_sub(MAX_WORD_LEN) + ..(old_pos + MAX_WORD_LEN).min(old_text.len_chars()); + let new_window = new_start.saturating_sub(MAX_WORD_LEN) + ..(new_pos + MAX_WORD_LEN).min(new_text.len_chars()); + + return Some((old_text.slice(old_window), new_text.slice(new_window))); + }) +} + +/// Estimates whether a changeset is significant or small. 
+fn is_changeset_significant(changes: &ChangeSet) -> bool { + use helix_core::Operation::*; + + let mut diff = 0; + for operation in changes.changes() { + match operation { + Retain(_) => continue, + Delete(_) | Insert(_) => diff += operation.len(), + } + } + + // This is arbitrary and could be tuned further: + diff > 1_000 +} + +pub(crate) fn register_hooks(handlers: &Handlers) { + let coordinator = handlers.word_index.coordinator.clone(); + register_hook!(move |event: &mut DocumentDidOpen<'_>| { + let doc = doc!(event.editor, &event.doc); + if doc.word_completion_enabled() { + coordinator.send(Event::Insert(doc.text().clone())).unwrap(); + } + Ok(()) + }); + + let tx = handlers.word_index.hook.clone(); + register_hook!(move |event: &mut DocumentDidChange<'_>| { + if !event.ghost_transaction && event.doc.word_completion_enabled() { + helix_event::send_blocking( + &tx, + Event::Update( + event.doc.id(), + Change { + old_text: event.old_text.clone(), + text: event.doc.text().clone(), + changes: event.changes.clone(), + }, + ), + ); + } + Ok(()) + }); + + let tx = handlers.word_index.hook.clone(); + register_hook!(move |event: &mut DocumentDidClose<'_>| { + if event.doc.word_completion_enabled() { + helix_event::send_blocking( + &tx, + Event::Delete(event.doc.id(), event.doc.text().clone()), + ); + } + Ok(()) + }); +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + + use super::*; + use helix_core::diff::compare_ropes; + + impl WordIndex { + fn words(&self) -> HashSet { + let inner = self.inner.read(); + inner.words().cloned().collect() + } + } + + #[track_caller] + fn assert_words>(text: &str, expected: T) { + let text = Rope::from_str(text); + let index = WordIndex::default(); + index.add_document(&text); + let actual = index.words(); + let expected: HashSet<_> = expected.into_iter().map(|i| i.to_string()).collect(); + assert_eq!(expected, actual); + } + + #[test] + fn parse() { + assert_words("one two three", ["one", "two", "three"]); + 
assert_words("a foo c", ["foo"]); + } + + #[track_caller] + fn assert_diff(before: &str, after: &str, expect_removed: R, expect_inserted: I) + where + S: ToString, + R: IntoIterator, + I: IntoIterator, + { + let before = Rope::from_str(before); + let after = Rope::from_str(after); + let diff = compare_ropes(&before, &after); + let expect_removed: HashSet<_> = + expect_removed.into_iter().map(|i| i.to_string()).collect(); + let expect_inserted: HashSet<_> = + expect_inserted.into_iter().map(|i| i.to_string()).collect(); + + let index = WordIndex::default(); + index.add_document(&before); + let words_before = index.words(); + index.update_document(&before, &after, diff.changes()); + let words_after = index.words(); + + let actual_removed = words_before.difference(&words_after).cloned().collect(); + let actual_inserted = words_after.difference(&words_before).cloned().collect(); + + eprintln!("\"{before}\" {words_before:?} => \"{after}\" {words_after:?}"); + assert_eq!( + expect_removed, actual_removed, + "expected {expect_removed:?} to be removed, instead {actual_removed:?} was" + ); + assert_eq!( + expect_inserted, actual_inserted, + "expected {expect_inserted:?} to be inserted, instead {actual_inserted:?} was" + ); + } + + #[test] + fn diff() { + assert_diff("one two three", "one five three", ["two"], ["five"]); + assert_diff("one two three", "one to three", ["two"], []); + assert_diff("one two three", "one three", ["two"], []); + assert_diff("one two three", "one t{o three", ["two"], []); + assert_diff("one foo three", "one fooo three", ["foo"], ["fooo"]); + + // TODO: further testing. Consider setting the max word size smaller in tests. + } +} From 91d2725db1b06024ec4a91c2d7a1362a74108034 Mon Sep 17 00:00:00 2001 From: Michael Davis Date: Tue, 25 Mar 2025 13:26:34 -0400 Subject: [PATCH 2/2] Add a very small owned string type for the word index `TinyBoxedStr` is a small-string optimized replacement for `Box` styled after . 
A nearly identical type is used in helix-editor/spellbook for space savings. This type is specialized for its use-case: * strings are immutable after creation * strings are very very small (less than 256 bytes long) Because of these attributes we can use nearly the full size of the type for the inline representation and also keep a small total size. Many other small string crates in the wild are 3 `usize`s long (same as a regular `String`) to support mutability. This type is more like a `Box<str>` (2 `usize`s long). Mostly I like this small string type though because it's very straightforward to implement. Other than a few functions that reach into `std::alloc` or `std::ptr`, the code is short and boring. --- helix-stdx/src/lib.rs | 1 + helix-stdx/src/str.rs | 288 ++++++++++++++++++++++++++ helix-view/src/handlers/word_index.rs | 17 +- 3 files changed, 301 insertions(+), 5 deletions(-) create mode 100644 helix-stdx/src/str.rs diff --git a/helix-stdx/src/lib.rs b/helix-stdx/src/lib.rs index d09df587a63d..93cc8304a011 100644 --- a/helix-stdx/src/lib.rs +++ b/helix-stdx/src/lib.rs @@ -3,5 +3,6 @@ pub mod faccess; pub mod path; pub mod range; pub mod rope; +pub mod str; pub use range::Range; diff --git a/helix-stdx/src/str.rs b/helix-stdx/src/str.rs new file mode 100644 index 000000000000..967102a8ac44 --- /dev/null +++ b/helix-stdx/src/str.rs @@ -0,0 +1,288 @@ +//! Utilities for working with strings and specialized string types. + +use std::{ + alloc, + borrow::{Borrow, Cow}, + fmt, hash, + mem::{size_of, ManuallyDrop}, + ptr::{self, NonNull}, + slice, str, +}; + +/// A very very small owned string type. +/// +/// This type is like a `Box<str>` and is similarly two `usize`s large. It can only fit strings +/// with a byte length smaller than 256. On 64-bit machines this type stores up to 15 bytes inline +/// (7 bytes on 32-bit machines). One byte is used to store the length. 
For strings short enough +/// to be stored inline, the remaining 15 (or 7) bytes store the content inline. Otherwise the +/// second `usize` of memory is a thin pointer to the string content. +/// +/// Unlike `Box<str>` this type is not null-pointer optimized. +#[repr(C)] +pub struct TinyBoxedStr { + len: u8, + prefix: [u8; Self::PREFIX_LEN], + trailing: TinyBoxedStrTrailing, +} + +#[repr(C)] +union TinyBoxedStrTrailing { + suffix: [u8; TinyBoxedStr::SUFFIX_LEN], + ptr: ManuallyDrop>, +} + +impl TinyBoxedStr { + // 1 usize minus the byte to store the length. + const PREFIX_LEN: usize = size_of::() - size_of::(); + // The other `usize` is a pointer or the end parts of an inline string. + const SUFFIX_LEN: usize = size_of::(); + // ... for a grand total of 15 bytes for 64-bit machines or 7 for 32-bit. + const INLINE_LEN: u8 = (Self::PREFIX_LEN + Self::SUFFIX_LEN) as u8; + + pub const MAX_LEN: usize = u8::MAX as usize; + + #[inline] + pub fn len(&self) -> usize { + self.len as usize + } + + #[inline] + pub fn is_empty(&self) -> bool { + self.len == 0 + } + + pub fn as_bytes(&self) -> &[u8] { + let ptr = if self.len <= Self::INLINE_LEN { + let ptr = ptr::from_ref(self); + unsafe { ptr::addr_of!((*ptr).prefix) }.cast() + } else { + unsafe { self.trailing.ptr }.as_ptr() + }; + unsafe { slice::from_raw_parts(ptr, self.len()) } + } + + #[inline] + pub fn as_str(&self) -> &str { + unsafe { str::from_utf8_unchecked(self.as_bytes()) } + } + + /// Exposes the bytes as a mutable slice. + /// + /// When a string is short enough to be inline, this slice points to the `prefix` and `suffix` + /// parts of the struct. Otherwise the slice wraps the pointer to the allocation. + /// + /// SAFETY: As such, if the string is allocated then it is the caller's responsibility to + /// ensure that any modifications made to `&s.as_bytes_mut()[..Self::PREFIX_LEN]` are also + /// written to `s.prefix`. 
+ /// + /// SAFETY: It is also the caller's responsibility to ensure that edits to the bytes do not + /// make the bytes invalid UTF-8. + unsafe fn as_bytes_mut(&mut self) -> &mut [u8] { + let ptr = if self.len <= Self::INLINE_LEN { + let ptr = ptr::from_mut(self); + unsafe { ptr::addr_of_mut!((*ptr).prefix) }.cast() + } else { + unsafe { self.trailing.ptr }.as_ptr() + }; + unsafe { slice::from_raw_parts_mut(ptr, self.len()) } + } + + fn layout(len: u8) -> alloc::Layout { + alloc::Layout::array::(len as usize) + .expect("a valid layout for an array") + .pad_to_align() + } + + /// Creates a new `TinyBoxedStr` of the given length with all bytes zeroed. + /// + /// While this is used to create uninitialized strings which are later filled, note that the + /// zero byte is valid UTF-8 so the zeroed representation is always valid. + fn zeroed(len: u8) -> Self { + let trailing = if len <= Self::INLINE_LEN { + TinyBoxedStrTrailing { + suffix: [0; Self::SUFFIX_LEN], + } + } else { + let layout = Self::layout(len); + let nullable = unsafe { alloc::alloc_zeroed(layout) }; + let Some(ptr) = NonNull::new(nullable) else { + alloc::handle_alloc_error(layout); + }; + TinyBoxedStrTrailing { + ptr: ManuallyDrop::new(ptr), + } + }; + Self { + len, + prefix: [0; Self::PREFIX_LEN], + trailing, + } + } +} + +#[derive(Debug)] +pub struct TooLongError; + +impl fmt::Display for TooLongError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str("string was too long to be stored as a `TinyBoxedStr` (max 256 bytes)") + } +} + +impl std::error::Error for TooLongError {} + +impl TryFrom<&str> for TinyBoxedStr { + type Error = TooLongError; + + fn try_from(s: &str) -> Result { + if s.len() > Self::MAX_LEN { + return Err(TooLongError); + } + + let mut this = Self::zeroed(s.len() as u8); + // SAFETY: if `s` is valid UTF-8, `this`'s bytes will be valid UTF-8. 
+ unsafe { this.as_bytes_mut() }.copy_from_slice(s.as_bytes()); + if this.len > Self::INLINE_LEN { + this.prefix + .copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]); + } + Ok(this) + } +} + +// NOTE: converting from a `String` to a `TinyBoxedStr` is cheap when the string's length is equal +// to its capacity. +impl TryFrom<String> for TinyBoxedStr { + type Error = TooLongError; + + fn try_from(s: String) -> Result<Self, Self::Error> { + // Inline strings must be cloned. It's a constant number of bytes to copy though. + if s.len() <= Self::INLINE_LEN as usize { + return s.as_str().try_into(); + } + + // Otherwise we can sometimes steal the `String`'s allocation if the string is allocated + // exactly (i.e. `s.len() == s.capacity()`). A `Box<str>` is defined as being allocated + // exactly so we first convert to `Box<str>` (which will reallocate if the capacity is not + // the same as the length) and then steal its pointer. + + if s.len() > Self::MAX_LEN { + return Err(TooLongError); + } + + let len = s.len() as u8; + let mut prefix = [0; Self::PREFIX_LEN]; + prefix.copy_from_slice(&s.as_bytes()[..Self::PREFIX_LEN]); + let ptr = Box::into_raw(s.into_boxed_str()).cast::<u8>(); + // SAFETY: `Box::into_raw` docs guarantee non-null. + let ptr = ManuallyDrop::new(unsafe { NonNull::new_unchecked(ptr) }); + let trailing = TinyBoxedStrTrailing { ptr }; + + Ok(Self { + len, + prefix, + trailing, + }) + } +} + +impl TryFrom<Cow<'_, str>> for TinyBoxedStr { + type Error = TooLongError; + + fn try_from(s: Cow<'_, str>) -> Result<Self, Self::Error> { + match s { + Cow::Borrowed(s) => s.try_into(), + Cow::Owned(s) => s.try_into(), + } + } +} + +impl TryFrom<ropey::RopeSlice<'_>> for TinyBoxedStr { + type Error = TooLongError; + + fn try_from(slice: ropey::RopeSlice<'_>) -> Result<Self, Self::Error> { + // `impl From<RopeSlice> for String` uses `String::with_capacity` so we can reuse its + // allocation whenever it allocates `slice.len_bytes()`.
+ let s: Cow<str> = slice.into(); + s.try_into() + } +} + +impl Drop for TinyBoxedStr { + fn drop(&mut self) { + if self.len > Self::INLINE_LEN { + let ptr = unsafe { self.trailing.ptr }.as_ptr(); + let layout = Self::layout(self.len); + unsafe { alloc::dealloc(ptr, layout) } + } + } +} + +impl Clone for TinyBoxedStr { + fn clone(&self) -> Self { + let mut this = Self::zeroed(self.len); + // SAFETY: if `self` is valid UTF-8 then `this` will be too. + unsafe { this.as_bytes_mut() }.copy_from_slice(self.as_bytes()); + if this.len > Self::INLINE_LEN { + this.prefix + .copy_from_slice(&self.as_bytes()[..Self::PREFIX_LEN]); + } + this + } +} + +impl Default for TinyBoxedStr { + fn default() -> Self { + Self::zeroed(0) + } +} + +impl AsRef<str> for TinyBoxedStr { + fn as_ref(&self) -> &str { + self.as_str() + } +} + +impl Borrow<str> for TinyBoxedStr { + fn borrow(&self) -> &str { + self.as_str() + } +} + +// NOTE: this could be specialized to optimize the number of comparison operations. We could cast +// the first `usize` of memory together to do a single comparison (and same for the suffixes). +// This optimization would only matter if we compared these strings very frequently however.
+impl PartialEq for TinyBoxedStr { + fn eq(&self, other: &Self) -> bool { + self.as_str() == other.as_str() + } +} + +impl Eq for TinyBoxedStr {} + +impl PartialEq<str> for TinyBoxedStr { + fn eq(&self, other: &str) -> bool { + self.as_str() == other + } +} + +impl hash::Hash for TinyBoxedStr { + fn hash<H: hash::Hasher>(&self, state: &mut H) { + self.as_str().hash(state) + } +} + +impl fmt::Debug for TinyBoxedStr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.as_str().fmt(f) + } +} + +impl fmt::Display for TinyBoxedStr { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.as_str().fmt(f) + } +} + +unsafe impl Send for TinyBoxedStr {} +unsafe impl Sync for TinyBoxedStr {} diff --git a/helix-view/src/handlers/word_index.rs b/helix-view/src/handlers/word_index.rs index f84e77e28fda..61b65f700488 100644 --- a/helix-view/src/handlers/word_index.rs +++ b/helix-view/src/handlers/word_index.rs @@ -123,8 +123,7 @@ const MIN_WORD_GRAPHEMES: usize = 3; /// Maximum word length allowed (in chars) const MAX_WORD_LEN: usize = 50; -// TODO: choose or create a suitable small string type. -type Word = String; +type Word = helix_stdx::str::TinyBoxedStr; #[derive(Debug, Default)] struct WordIndexInner { @@ -142,11 +141,16 @@ impl WordIndexInner { } fn insert(&mut self, word: RopeSlice) { + assert!(word.len_chars() <= MAX_WORD_LEN); + // The word must be shorter than `TinyBoxedStr::MAX_LEN` because it is at most 50 + // characters and characters take at most four bytes.
+ assert!(word.len_bytes() < Word::MAX_LEN); + let word: Cow<str> = word.into(); if let Some(rc) = self.words.get_mut(word.as_ref()) { *rc = rc.saturating_add(1); } else { - self.words.insert(word.into_owned(), 1); + self.words.insert(word.try_into().unwrap(), 1); } } @@ -172,7 +176,10 @@ impl WordIndex { let inner = self.inner.read(); let mut matches = fuzzy_match(pattern, inner.words(), false); matches.sort_unstable_by_key(|(_, score)| *score); - matches.into_iter().map(|(word, _)| word.clone()).collect() + matches + .into_iter() + .map(|(word, _)| word.to_string()) + .collect() } fn add_document(&self, text: &Rope) { @@ -409,7 +416,7 @@ mod tests { impl WordIndex { fn words(&self) -> HashSet<String> { let inner = self.inner.read(); - inner.words().cloned().collect() + inner.words().map(|w| w.to_string()).collect() } }