diff --git a/Cargo.lock b/Cargo.lock index e897a1cd..220535a3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -875,4 +875,5 @@ dependencies = [ "rstest", "serde", "tempfile", + "unicode-segmentation", ] diff --git a/Cargo.toml b/Cargo.toml index 61c00d0f..a40a07b9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -27,6 +27,7 @@ glob = "0.3.0" ordered-float = "2.0.0" serde = { version = "1.0.116", features = ["derive"] } tempfile = "3.1.0" +unicode-segmentation = "1.8.0" [target.'cfg(windows)'.dependencies] rand = { version = "0.8.4", features = [ diff --git a/contrib/completions/_zoxide b/contrib/completions/_zoxide index 178a27d3..2fe0400d 100644 --- a/contrib/completions/_zoxide +++ b/contrib/completions/_zoxide @@ -68,8 +68,8 @@ _arguments "${_arguments_options[@]}" \ '(-l --list)--interactive[Use interactive selection]' \ '(-i --interactive)-l[List all matching directories]' \ '(-i --interactive)--list[List all matching directories]' \ -'(-i --interactive)-s[Print score with results]' \ -'(-i --interactive)--score[Print score with results]' \ +'(-i --interactive)-s[Print score with results (keyword match score, frequency score)]' \ +'(-i --interactive)--score[Print score with results (keyword match score, frequency score)]' \ '-h[Print help information]' \ '--help[Print help information]' \ '-V[Print version information]' \ diff --git a/contrib/completions/_zoxide.ps1 b/contrib/completions/_zoxide.ps1 index 72138e63..4a427e6a 100644 --- a/contrib/completions/_zoxide.ps1 +++ b/contrib/completions/_zoxide.ps1 @@ -64,8 +64,8 @@ Register-ArgumentCompleter -Native -CommandName 'zoxide' -ScriptBlock { [CompletionResult]::new('--interactive', 'interactive', [CompletionResultType]::ParameterName, 'Use interactive selection') [CompletionResult]::new('-l', 'l', [CompletionResultType]::ParameterName, 'List all matching directories') [CompletionResult]::new('--list', 'list', [CompletionResultType]::ParameterName, 'List all matching directories') - [CompletionResult]::new('-s', 's', [CompletionResultType]::ParameterName, 'Print score with results') - [CompletionResult]::new('--score', 'score', [CompletionResultType]::ParameterName, 'Print score with results') + [CompletionResult]::new('-s', 's', [CompletionResultType]::ParameterName, 'Print score with results (keyword match score, frequency score)') + [CompletionResult]::new('--score', 'score', [CompletionResultType]::ParameterName, 'Print score with results (keyword match score, frequency score)') [CompletionResult]::new('-h', 'h', [CompletionResultType]::ParameterName, 'Print help information') [CompletionResult]::new('--help', 'help', [CompletionResultType]::ParameterName, 'Print help information') [CompletionResult]::new('-V', 'V', [CompletionResultType]::ParameterName, 'Print version information') diff --git a/contrib/completions/zoxide.elv b/contrib/completions/zoxide.elv index dfdebc23..5783df10 100644 --- a/contrib/completions/zoxide.elv +++ b/contrib/completions/zoxide.elv @@ -58,8 +58,8 @@ set edit:completion:arg-completer[zoxide] = [@words]{ cand --interactive 'Use interactive selection' cand -l 'List all matching directories' cand --list 'List all matching directories' - cand -s 'Print score with results' - cand --score 'Print score with results' + cand -s 'Print score with results (keyword match score, frequency score)' + cand --score 'Print score with results (keyword match score, frequency score)' cand -h 'Print help information' cand --help 'Print help information' cand -V 'Print version information' diff --git a/contrib/completions/zoxide.fish b/contrib/completions/zoxide.fish index 1ca8db01..3d4a6077 100644 --- a/contrib/completions/zoxide.fish +++ b/contrib/completions/zoxide.fish @@ -20,7 +20,7 @@ complete -c zoxide -n "__fish_seen_subcommand_from query" -l exclude -d 'Exclude complete -c zoxide -n "__fish_seen_subcommand_from query" -l all -d 'Show deleted directories' complete -c zoxide -n "__fish_seen_subcommand_from query" -s i -l interactive -d 'Use interactive selection' complete -c zoxide -n "__fish_seen_subcommand_from query" -s l -l list -d 'List all matching directories' -complete -c zoxide -n "__fish_seen_subcommand_from query" -s s -l score -d 'Print score with results' +complete -c zoxide -n "__fish_seen_subcommand_from query" -s s -l score -d 'Print score with results (keyword match score, frequency score)' complete -c zoxide -n "__fish_seen_subcommand_from query" -s h -l help -d 'Print help information' complete -c zoxide -n "__fish_seen_subcommand_from query" -s V -l version -d 'Print version information' complete -c zoxide -n "__fish_seen_subcommand_from remove" -s i -l interactive -r diff --git a/contrib/completions/zoxide.ts b/contrib/completions/zoxide.ts index 0c41a758..b86362a1 100644 --- a/contrib/completions/zoxide.ts +++ b/contrib/completions/zoxide.ts @@ -159,7 +159,7 @@ const completion: Fig.Spec = { }, { name: ["-s", "--score"], - description: "Print score with results", + description: "Print score with results (keyword match score, frequency score)", }, { name: ["-h", "--help"], diff --git a/src/app/_app.rs b/src/app/_app.rs index e40e3e74..941358b9 100644 --- a/src/app/_app.rs +++ b/src/app/_app.rs @@ -111,7 +111,7 @@ pub struct Query { #[clap(long, short, conflicts_with = "interactive")] pub list: bool, - /// Print score with results + /// Print score with results (keyword match score, frequency score) #[clap(long, short, conflicts_with = "interactive")] pub score: bool, diff --git a/src/app/query.rs b/src/app/query.rs index cb1de833..0fa5a725 100644 --- a/src/app/query.rs +++ b/src/app/query.rs @@ -30,17 +30,18 @@ impl Query { stream = stream.with_exclude(path); } + let mut stream = stream.into_iter(); if self.interactive { let mut fzf = Fzf::new(false)?; while let Some(dir) = stream.next() { - writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?; + writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?; } let selection = fzf.wait_select()?; if self.score { print!("{}", selection); } else { - let path = selection.get(5..).context("could not read selection from fzf")?; + let path = selection.get(10..).context("could not read selection from fzf")?; print!("{}", path); } } else if self.list { @@ -48,7 +49,7 @@ impl Query { let handle = &mut stdout.lock(); while let Some(dir) = stream.next() { if self.score { - writeln!(handle, "{}", dir.display_score(now)) + writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords))) } else { writeln!(handle, "{}", dir.display()) } @@ -58,7 +59,7 @@ impl Query { } else { let dir = stream.next().context("no match found")?; if self.score { - writeln!(io::stdout(), "{}", dir.display_score(now)) + writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords))) } else { writeln!(io::stdout(), "{}", dir.display()) } diff --git a/src/app/remove.rs b/src/app/remove.rs index 18334712..b3b90ff4 100644 --- a/src/app/remove.rs +++ b/src/app/remove.rs @@ -18,15 +18,15 @@ impl Run for Remove { match &self.interactive { Some(keywords) => { let now = util::current_time()?; - let mut stream = db.stream(now).with_keywords(keywords); + let mut stream = db.stream(now).with_keywords(keywords).into_iter(); let mut fzf = Fzf::new(true)?; while let Some(dir) = stream.next() { - writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?; + writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?; } selection = fzf.wait_select()?; - let paths = selection.lines().filter_map(|line| line.get(5..)); + let paths = selection.lines().filter_map(|line| line.get(10..)); for path in paths { if !db.remove(path) { bail!("path not found in database: {}", path); diff --git a/src/db/dir.rs b/src/db/dir.rs index 1661a1fe..8ff819d0 100644 --- a/src/db/dir.rs +++ b/src/db/dir.rs @@ -1,12 +1,15 @@ use std::borrow::Cow; use std::fmt::{self, Display, Formatter}; use std::ops::{Deref, DerefMut}; +use std::path::PathBuf; +use std::str::FromStr; use anyhow::{bail, Context, Result}; use bincode::Options as _; use serde::{Deserialize, Serialize}; +use unicode_segmentation::UnicodeSegmentation; -#[derive(Debug, Deserialize, Serialize)] +#[derive(Debug, Deserialize, Serialize, Default)] pub struct DirList<'a>(#[serde(borrow)] pub Vec>); impl DirList<'_> { @@ -88,14 +91,14 @@ pub struct Dir<'a> { } impl Dir<'_> { - pub fn score(&self, now: Epoch) -> Rank { + pub fn score(&self, now: Epoch, keywords: &Vec) -> Score { const HOUR: Epoch = 60 * 60; const DAY: Epoch = 24 * HOUR; const WEEK: Epoch = 7 * DAY; // The older the entry, the lesser its importance. let duration = now.saturating_sub(self.last_accessed); - if duration < HOUR { + let adjusted_rank = if duration < HOUR { self.rank * 4.0 } else if duration < DAY { self.rank * 2.0 @@ -103,18 +106,146 @@ impl Dir<'_> { self.rank * 0.5 } else { self.rank * 0.25 + }; + + for keyword in keywords { + debug_assert!(self.path.to_lowercase().contains(&keyword.to_lowercase())); + } + + let mut kw_score_sum = 0; + + let smart_case = keywords.iter().all(|kw| &kw.to_lowercase() == kw); + + // Split the path into components, then words, so the "m" can be a better match + // for "folk music" than for "tom", and the best match for "music". + // And even more so if it's the last path component. + let path = PathBuf::from_str(&self.path).unwrap(); // safe because error is Infallible + let path_components = path.components(); + let mut is_last_component = true; + for component in path_components.rev() { + let component = component.as_os_str().to_str().unwrap(); // safe because the path came from a string + let component = if smart_case { component.to_lowercase() } else { component.to_owned() }; + + let left_word_boundaries = left_word_boundaries(&component); + for keyword in keywords { + kw_score_sum += + Self::compute_kw_score(&component, keyword, &left_word_boundaries, smart_case, is_last_component); + } + is_last_component = false; } + + (kw_score_sum, adjusted_rank) + } + + pub fn compute_kw_score( + path_component: &str, + keyword: &str, + left_word_boundaries: &Vec, + smart_case: bool, + is_last_component: bool, + ) -> u64 { + let keyword_lower = &keyword.to_lowercase(); + let path_lower = path_component.to_lowercase(); + + // more than one boundary can match + let mut best_boundary_score = 0; + for idx in left_word_boundaries { + // TODO: think carefully about these rules. Should the case of the match + // be allowed to influence the score? What if it's all lowercase, so + // a smart case match is impossible? + let word = &path_component[*idx..]; + let word_lower = &path_lower[*idx..]; + if word.starts_with(keyword) { + // exact match, but even better if it's at the leftmost position in the component, + // like "D" matching $HOME/Documents + let score = if *idx == 0 { 105 } else { 100 }; + + // TODO: think about checking the right word boundary, and give extra points if it matches. + // Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will + // match src_3. But "src" will match src. + best_boundary_score = best_boundary_score.max(score); + } else if !smart_case && word_lower.starts_with(keyword) { + // smart case is off (a keyword has case), but this keyword alone would be a smart case match + // for the component. + best_boundary_score = best_boundary_score.max(25); + } else if word_lower.starts_with(keyword_lower) { + // wrong case but it's a match otherwise + best_boundary_score = best_boundary_score.max(20); + } else { + // No score. We don't need to give any score for a keyword that matches but not on a word boundary-- + // All paths being checked should at least match in that way. + // But note that though the path will match the keyword, this path component may not match. + } + } + + if best_boundary_score > 0 && is_last_component { + // matches in the last path component should be considered a little better + best_boundary_score += 5; + } + + best_boundary_score } pub fn display(&self) -> DirDisplay { DirDisplay { dir: self } } - pub fn display_score(&self, now: Epoch) -> DirDisplayScore { - DirDisplayScore { dir: self, now } + pub fn display_score(&self, now: Epoch, keywords: Option<&Vec>) -> DirDisplayScore { + DirDisplayScore { dir: self, now, keywords: keywords.map(|vec| vec.iter().cloned().collect()) } } } +/// Returns byte indices that correspond to the leftmost position of each word. +/// For input "hi there", the result will contain 0 and 3. +/// +/// The result may also contain extraneous indices. +fn left_word_boundaries(text: &str) -> Vec { + let mut boundaries = Vec::new(); + + #[derive(PartialEq, Clone, Copy, PartialOrd)] + enum Case { + None, + LowerCase, + UpperCase, + } + + // We won't need the words themselves because we want to do multi-word match. + // We need the whole string for that. + for (word_idx, word) in text.unicode_word_indices() { + boundaries.push(word_idx); + + // Also search for case changes, and non-text characters: + // MyDocuments + // my_documents + // TODO: should "clap3b4" count as 4 words or 1? + let mut prev_case = None; + for (grapheme_idx, grapheme) in word.grapheme_indices(true) { + let lower = grapheme.to_lowercase(); + let upper = grapheme.to_uppercase(); + let case = if lower == grapheme && upper == grapheme { + Case::None + } else if lower == grapheme { + Case::LowerCase + } else { + // Assume the other cases are upper case, because there might be more than + // one way to represent upper case + Case::UpperCase + }; + + if let Some(prev_case) = &prev_case { + if case > *prev_case { + // Consider this a word start if going from no case to any case, + // or lower case to upper case. + boundaries.push(word_idx + grapheme_idx); + } + } + let _ = prev_case.replace(case); + } + } + + boundaries +} + pub struct DirDisplay<'a> { dir: &'a Dir<'a>, } @@ -128,11 +259,15 @@ impl Display for DirDisplay<'_> { pub struct DirDisplayScore<'a> { dir: &'a Dir<'a>, now: Epoch, + keywords: Option>, } impl Display for DirDisplayScore<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { - let score = self.dir.score(self.now); + let no_keywords = Vec::default(); + let keywords = self.keywords.as_ref().unwrap_or(&no_keywords); + + let (kw_score, score) = self.dir.score(self.now, keywords); let score = if score > 9999.0 { 9999 } else if score > 0.0 { @@ -140,18 +275,19 @@ impl Display for DirDisplayScore<'_> { } else { 0 }; - write!(f, "{:>4} {}", score, self.dir.path) + write!(f, "{:>4},{:>4} {}", kw_score, score, self.dir.path) } } pub type Rank = f64; +pub type Score = (u64, Rank); pub type Epoch = u64; #[cfg(test)] mod tests { - use std::borrow::Cow; + use std::{borrow::Cow, collections::HashSet}; - use super::{Dir, DirList}; + use super::{left_word_boundaries, Dir, DirList}; #[test] fn zero_copy() { @@ -164,4 +300,32 @@ mod tests { assert!(matches!(dir.path, Cow::Borrowed(_))) } } + + #[test] + fn test_left_word_boundaries() { + assert!(left_word_boundaries("") == vec![]); + assert!(left_word_boundaries("Hi") == vec![0]); + + assert!(vec![0, 3] + .into_iter() + .collect::>() + .is_subset(&left_word_boundaries("hi there").into_iter().collect())); + assert!(vec![0, 3] + .into_iter() + .collect::>() + .is_subset(&left_word_boundaries("hi_there").into_iter().collect())); + + assert!(vec![0, 4] == left_word_boundaries("FürElise")); + assert!(vec![0, 1] == left_word_boundaries("uTorrent")); + assert!(vec![0, 2] == left_word_boundaries("µTorrent")); + + assert!(vec![1, 6, 11] + .into_iter() + .collect::>() + .is_subset(&left_word_boundaries("/path/file.ext").into_iter().collect())); + assert!(vec![0, 3, 8, 13] + .into_iter() + .collect::>() + .is_subset(&left_word_boundaries(r"C:\path\file.ext").into_iter().collect())); + } } diff --git a/src/db/stream.rs b/src/db/stream.rs index e5d3eb98..0204134b 100644 --- a/src/db/stream.rs +++ b/src/db/stream.rs @@ -1,5 +1,3 @@ -use std::iter::Rev; -use std::ops::Range; use std::{fs, path}; use ordered_float::OrderedFloat; @@ -9,7 +7,6 @@ use crate::util; pub struct Stream<'db, 'file> { db: &'db mut Database<'file>, - idxs: Rev>, keywords: Vec, @@ -18,25 +15,22 @@ pub struct Stream<'db, 'file> { resolve_symlinks: bool, exclude_path: Option, + now: Epoch, } impl<'db, 'file> Stream<'db, 'file> { pub fn new(db: &'db mut Database<'file>, now: Epoch) -> Self { - // Iterate in descending order of score. - db.dirs.sort_unstable_by_key(|dir| OrderedFloat(dir.score(now))); - let idxs = (0..db.dirs.len()).rev(); - // If a directory is deleted and hasn't been used for 90 days, delete it from the database. let expire_below = now.saturating_sub(90 * 24 * 60 * 60); Stream { db, - idxs, keywords: Vec::new(), check_exists: false, expire_below, resolve_symlinks: false, exclude_path: None, + now, } } @@ -56,31 +50,22 @@ impl<'db, 'file> Stream<'db, 'file> { self } - pub fn next(&mut self) -> Option<&Dir<'file>> { - while let Some(idx) = self.idxs.next() { - let dir = &self.db.dirs[idx]; + pub fn into_iter(self) -> StreamIterator<'db, 'file> { + let mut idxs: Vec<_> = self.db.dirs.iter() + .enumerate() // store the original indices before filtering + .filter(|(_idx, dir)| + self.matches_keywords(&dir.path) && + Some(dir.path.as_ref()) != self.exclude_path.as_deref()) + .collect(); - if !self.matches_keywords(&dir.path) { - continue; - } - - if !self.matches_exists(&dir.path) { - if dir.last_accessed < self.expire_below { - self.db.dirs.swap_remove(idx); - self.db.modified = true; - } - continue; - } - - if Some(dir.path.as_ref()) == self.exclude_path.as_deref() { - continue; - } - - let dir = &self.db.dirs[idx]; - return Some(dir); - } + // Iterate in descending order of score. + idxs.sort_by_cached_key(|(_idx, dir)| { + let (kw_score, frequency_score) = dir.score(self.now, &self.keywords); + (kw_score, OrderedFloat(frequency_score)) + }); + let idxs = idxs.into_iter().map(|(idx, _)| idx).rev().collect::>().into_iter(); // copy the indices to avoid lifetime issues - None + StreamIterator { stream: self, idxs: Box::new(idxs) } } fn matches_exists>(&self, path: S) -> bool { @@ -120,6 +105,32 @@ impl<'db, 'file> Stream<'db, 'file> { } } +pub struct StreamIterator<'db, 'file> { + stream: Stream<'db, 'file>, + idxs: Box>, +} + +impl<'db, 'file> StreamIterator<'db, 'file> { + pub fn next(&mut self) -> Option<&Dir<'file>> { + while let Some(idx) = self.idxs.next() { + let dir = &self.stream.db.dirs[idx]; + + if !self.stream.matches_exists(&dir.path) { + if dir.last_accessed < self.stream.expire_below { + self.stream.db.dirs.swap_remove(idx); + self.stream.db.modified = true; + } + continue; + } + + let dir = &self.stream.db.dirs[idx]; + return Some(dir); + } + + None + } +} + #[cfg(test)] mod tests { use std::path::PathBuf;