diff --git a/Cargo.lock b/Cargo.lock
index ee6aeb6f..aeef88f2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -725,4 +725,5 @@ dependencies = [
  "rstest",
  "serde",
  "tempfile",
+ "unicode-segmentation",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index e4873083..635ac16c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ glob = "0.3.0"
 ordered-float = "2.0.0"
 serde = { version = "1.0.116", features = ["derive"] }
 tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"
 
 [target.'cfg(windows)'.dependencies]
 rand = { version = "0.8.4", features = [
diff --git a/src/app/query.rs b/src/app/query.rs
index b888b6a2..0fa5a725 100644
--- a/src/app/query.rs
+++ b/src/app/query.rs
@@ -34,7 +34,7 @@ impl Query {
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
-                writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
             }
 
             let selection = fzf.wait_select()?;
@@ -49,7 +49,7 @@ impl Query {
             let handle = &mut stdout.lock();
             while let Some(dir) = stream.next() {
                 if self.score {
-                    writeln!(handle, "{}", dir.display_score(now))
+                    writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
                 } else {
                     writeln!(handle, "{}", dir.display())
                 }
@@ -59,7 +59,7 @@ impl Query {
         } else {
             let dir = stream.next().context("no match found")?;
             if self.score {
-                writeln!(io::stdout(), "{}", dir.display_score(now))
+                writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
             } else {
                 writeln!(io::stdout(), "{}", dir.display())
             }
diff --git a/src/app/remove.rs b/src/app/remove.rs
index 9175b2b8..b3b90ff4 100644
--- a/src/app/remove.rs
+++ b/src/app/remove.rs
@@ -22,7 +22,7 @@ impl Run for Remove {
 
                 let mut fzf = Fzf::new(true)?;
                 while let Some(dir) = stream.next() {
-                    writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                    writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
                 }
 
                 selection = fzf.wait_select()?;
diff --git a/src/db/dir.rs b/src/db/dir.rs
index 1ef62af5..7cf6934e 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -5,6 +5,7 @@ use std::ops::{Deref, DerefMut};
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;
 
 #[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
@@ -88,7 +89,7 @@ pub struct Dir<'a> {
 }
 
 impl Dir<'_> {
-    pub fn score(&self, now: Epoch, _keywords: &Vec<String>) -> Score {
+    pub fn score(&self, now: Epoch, keywords: &[String]) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
@@ -105,22 +106,123 @@ impl Dir<'_> {
             self.rank * 0.25
         };
 
-        // TODO: incorporate keywords into the scoring logic, so match quality is more significant
-        // than the access date. See issue #260.
-        let kw_score = 1;
+        // Each keyword contributes the score of its best word-boundary match in the path.
+        let boundaries = left_word_boundaries(&self.path);
+        let kw_score_sum: u64 =
+            keywords.iter().map(|keyword| self.compute_kw_score(keyword, &boundaries)).sum();
 
-        (kw_score, adjusted_rank)
+        (kw_score_sum, adjusted_rank)
+    }
+
+    /// Scores how well `keyword` matches this directory's path at the start of a word.
+    ///
+    /// `left_word_boundaries` holds byte offsets into `self.path`. The best match over all
+    /// offsets wins: 100 for an exact match, 90 for a smart case match, 20 for a match that
+    /// only succeeds once both sides are lowercased, and 0 when the keyword starts at none of
+    /// the offsets.
+    ///
+    /// # Panics
+    ///
+    /// Panics if an offset is out of range or not on a character boundary of `self.path`.
+    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &[usize]) -> u64 {
+        let keyword_lower = keyword.to_lowercase();
+
+        // more than one boundary can match
+        let mut best_boundary_score: u64 = 0;
+        for &idx in left_word_boundaries {
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let path = &self.path[idx..];
+            // Lowercase the suffix itself instead of slicing a lowercased copy of the whole
+            // path: lowercasing can change byte lengths (e.g. U+0130 grows from 2 to 3 bytes),
+            // so offsets into `self.path` are not valid offsets into its lowercased form.
+            let path_lower = path.to_lowercase();
+            let boundary_score = if path.starts_with(keyword) {
+                // exact match
+                //
+                // TODO: think about checking the right word boundary, and give extra points if
+                //       it matches. Imagine two directories, src_3 and src. If src_3 is more
+                //       frequently used, "sr" will match src_3. But "src" will match src.
+                100
+            } else if path_lower.starts_with(keyword) {
+                // smart case match
+                90
+            } else if path_lower.starts_with(keyword_lower.as_str()) {
+                // wrong case but it's a match otherwise
+                20
+            } else {
+                // We don't need to give any score for a keyword that matches but not on a word
+                // boundary -- all paths being checked should at least match in that way.
+                0
+            };
+            best_boundary_score = best_boundary_score.max(boundary_score);
+        }
+        debug_assert!(self.path.to_lowercase().contains(keyword_lower.as_str()));
+
+        best_boundary_score
     }
 
     pub fn display(&self) -> DirDisplay {
         DirDisplay { dir: self }
     }
 
-    pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
-        DirDisplayScore { dir: self, now }
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.cloned() }
     }
 }
 
+/// Returns byte indices that correspond to the leftmost position of each word.
+/// For input "hi there", the result will contain 0 and 3.
+///
+/// The result may also contain extraneous indices.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    // Variant order matters: the derived `PartialOrd` ranks None < LowerCase < UpperCase,
+    // and a boundary is recorded whenever the rank increases.
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    let mut boundaries = Vec::new();
+
+    // We won't need the words themselves because we want to do multi-word match.
+    // We need the whole string for that.
+    for (word_idx, word) in text.unicode_word_indices() {
+        boundaries.push(word_idx);
+
+        // Also search for case changes, and non-text characters:
+        // MyDocuments
+        // my_documents
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut prev_case = None;
+        for (grapheme_idx, grapheme) in word.grapheme_indices(true) {
+            let lower = grapheme.to_lowercase();
+            let upper = grapheme.to_uppercase();
+            let case = if lower == grapheme && upper == grapheme {
+                Case::None
+            } else if lower == grapheme {
+                Case::LowerCase
+            } else {
+                // Assume the other cases are upper case, because there might be more than
+                // one way to represent upper case
+                Case::UpperCase
+            };
+
+            // Consider this a word start if going from no case to any case,
+            // or lower case to upper case.
+            if prev_case.map_or(false, |prev| case > prev) {
+                boundaries.push(word_idx + grapheme_idx);
+            }
+            prev_case = Some(case);
+        }
+    }
+
+    boundaries
+}
+
 pub struct DirDisplay<'a> {
     dir: &'a Dir<'a>,
 }
@@ -134,11 +236,14 @@ impl Display for DirDisplay<'_> {
 pub struct DirDisplayScore<'a> {
     dir: &'a Dir<'a>,
     now: Epoch,
+    keywords: Option<Vec<String>>,
 }
 
 impl Display for DirDisplayScore<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let (kw_score, score) = self.dir.score(self.now, &vec![]);
+        // Without keywords the slice is empty, so the keyword score is 0.
+        let keywords = self.keywords.as_deref().unwrap_or_default();
+        let (kw_score, score) = self.dir.score(self.now, keywords);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
@@ -156,9 +261,9 @@ pub type Epoch = u64;
 
 #[cfg(test)]
 mod tests {
-    use std::borrow::Cow;
+    use std::{borrow::Cow, collections::HashSet};
 
-    use super::{Dir, DirList};
+    use super::{left_word_boundaries, Dir, DirList};
 
     #[test]
     fn zero_copy() {
@@ -171,4 +276,28 @@ mod tests {
             assert!(matches!(dir.path, Cow::Borrowed(_)))
         }
     }
+
+    #[test]
+    fn test_left_word_boundaries() {
+        // The result may contain extraneous indices, so only require the expected ones.
+        fn assert_contains(text: &str, expected: &[usize]) {
+            let actual: HashSet<usize> = left_word_boundaries(text).into_iter().collect();
+            for idx in expected {
+                assert!(actual.contains(idx), "{:?}: missing boundary {}", text, idx);
+            }
+        }
+
+        assert!(left_word_boundaries("").is_empty());
+        assert_eq!(left_word_boundaries("Hi"), vec![0]);
+
+        assert_contains("hi there", &[0, 3]);
+        assert_contains("hi_there", &[0, 3]);
+
+        assert_eq!(left_word_boundaries("FürElise"), vec![0, 4]);
+        assert_eq!(left_word_boundaries("uTorrent"), vec![0, 1]);
+        assert_eq!(left_word_boundaries("µTorrent"), vec![0, 2]);
+
+        assert_contains("/path/file.ext", &[1, 6, 11]);
+        assert_contains(r"C:\path\file.ext", &[0, 3, 8, 13]);
+    }
 }