diff --git a/Cargo.lock b/Cargo.lock
index ee6aeb6f..aeef88f2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -725,4 +725,5 @@ dependencies = [
"rstest",
"serde",
"tempfile",
+ "unicode-segmentation",
]
diff --git a/Cargo.toml b/Cargo.toml
index e4873083..635ac16c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -20,6 +20,7 @@ glob = "0.3.0"
ordered-float = "2.0.0"
serde = { version = "1.0.116", features = ["derive"] }
tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"
[target.'cfg(windows)'.dependencies]
rand = { version = "0.8.4", features = [
diff --git a/src/app/query.rs b/src/app/query.rs
index b888b6a2..0fa5a725 100644
--- a/src/app/query.rs
+++ b/src/app/query.rs
@@ -34,7 +34,7 @@ impl Query {
if self.interactive {
let mut fzf = Fzf::new(false)?;
while let Some(dir) = stream.next() {
- writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+ writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
}
let selection = fzf.wait_select()?;
@@ -49,7 +49,7 @@ impl Query {
let handle = &mut stdout.lock();
while let Some(dir) = stream.next() {
if self.score {
- writeln!(handle, "{}", dir.display_score(now))
+ writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
} else {
writeln!(handle, "{}", dir.display())
}
@@ -59,7 +59,7 @@ impl Query {
} else {
let dir = stream.next().context("no match found")?;
if self.score {
- writeln!(io::stdout(), "{}", dir.display_score(now))
+ writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
} else {
writeln!(io::stdout(), "{}", dir.display())
}
diff --git a/src/app/remove.rs b/src/app/remove.rs
index 9175b2b8..b3b90ff4 100644
--- a/src/app/remove.rs
+++ b/src/app/remove.rs
@@ -22,7 +22,7 @@ impl Run for Remove {
let mut fzf = Fzf::new(true)?;
while let Some(dir) = stream.next() {
- writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+ writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
}
selection = fzf.wait_select()?;
diff --git a/src/db/dir.rs b/src/db/dir.rs
index 1ef62af5..7cf6934e 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -5,6 +5,7 @@ use std::ops::{Deref, DerefMut};
use anyhow::{bail, Context, Result};
use bincode::Options as _;
use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;
#[derive(Debug, Deserialize, Serialize, Default)]
pub struct DirList<'a>(#[serde(borrow)] pub Vec
>);
@@ -88,7 +89,7 @@ pub struct Dir<'a> {
}
impl Dir<'_> {
- pub fn score(&self, now: Epoch, _keywords: &Vec) -> Score {
+ pub fn score(&self, now: Epoch, keywords: &Vec) -> Score {
const HOUR: Epoch = 60 * 60;
const DAY: Epoch = 24 * HOUR;
const WEEK: Epoch = 7 * DAY;
@@ -105,22 +106,111 @@ impl Dir<'_> {
self.rank * 0.25
};
- // TODO: incorporate keywords into the scoring logic, so match quality is more significant
- // than the access date. See issue #260.
- let kw_score = 1;
+ let left_word_boundaries = left_word_boundaries(&self.path);
- (kw_score, adjusted_rank)
+ let mut kw_score_sum = 0;
+ for keyword in keywords {
+ kw_score_sum += self.compute_kw_score(keyword, &left_word_boundaries);
+ }
+
+ (kw_score_sum, adjusted_rank)
+ }
+
+    /// Scores how well `keyword` matches this directory's path at the start of a word.
+    ///
+    /// `left_word_boundaries` holds byte offsets into `self.path`. Every offset is tried as a
+    /// match position and the best score over all offsets is returned:
+    ///
+    /// * `100` - the path continues with `keyword` exactly as typed,
+    /// * `90` - the lowercased path continues with `keyword` as typed (smart case),
+    /// * `20` - the match only holds once `keyword` is lowercased as well,
+    /// * `0` - `keyword` does not start at any of the given offsets.
+    ///
+    /// Offsets that are out of range or do not fall on a character boundary are ignored.
+    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &[usize]) -> u64 {
+        const EXACT_MATCH: u64 = 100;
+        const SMART_CASE_MATCH: u64 = 90;
+        const WRONG_CASE_MATCH: u64 = 20;
+
+        let keyword_lower = keyword.to_lowercase();
+
+        // We don't need to give any score for a keyword that matches but not on a word boundary--
+        // all paths being checked should at least match in that way.
+        debug_assert!(self.path.to_lowercase().contains(&keyword_lower));
+
+        // More than one boundary can match, so keep the best score seen.
+        let mut best_boundary_score = 0;
+        for &idx in left_word_boundaries {
+            // `get` instead of indexing: a bad offset is skipped rather than causing a panic.
+            let path = match self.path.get(idx..) {
+                Some(path) => path,
+                None => continue,
+            };
+
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let boundary_score = if path.starts_with(keyword) {
+                // exact match
+
+                // TODO: think about checking the right word boundary, and give extra points if it
+                // matches. Imagine two directories, src_3 and src. If src_3 is more frequently
+                // used, "sr" will match src_3. But "src" will match src.
+                EXACT_MATCH
+            } else {
+                // The offsets index the original path. Lowercasing can change the byte length of
+                // a character, so the same offset is not valid in a lowercased copy of the whole
+                // path. Lowercase the suffix itself instead.
+                let path_lower = path.to_lowercase();
+                if path_lower.starts_with(keyword) {
+                    // smart case match
+                    SMART_CASE_MATCH
+                } else if path_lower.starts_with(&keyword_lower) {
+                    // wrong case but it's a match otherwise
+                    WRONG_CASE_MATCH
+                } else {
+                    0
+                }
+            };
+
+            best_boundary_score = best_boundary_score.max(boundary_score);
+            if best_boundary_score == EXACT_MATCH {
+                // Nothing can beat an exact match.
+                break;
+            }
+        }
+
+        best_boundary_score
    }
pub fn display(&self) -> DirDisplay {
DirDisplay { dir: self }
}
- pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
- DirDisplayScore { dir: self, now }
+    /// Wraps this directory in a [`DirDisplayScore`] for formatting.
+    ///
+    /// `now` and a copy of `keywords` are stored in the wrapper. `None` stores no keywords.
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.cloned() }
}
}
+/// Returns byte indices that correspond to the leftmost position of each word.
+/// For input "hi there", the result will contain 0 and 3.
+///
+/// Besides the starts of Unicode words, an index is reported inside a word wherever the case
+/// class rises: from an uncased grapheme (digit, `_`, `.`) to a cased one, or from lower case to
+/// upper case. This covers names such as `MyDocuments` and `my_documents`.
+///
+/// The result may also contain extraneous indices.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    // Ordered so that a "rise" in case class is a plain `>` comparison.
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    /// Classifies a grapheme by comparing it with its lower- and upper-cased forms.
+    fn classify(grapheme: &str) -> Case {
+        let is_lower = grapheme.to_lowercase() == grapheme;
+        let is_upper = grapheme.to_uppercase() == grapheme;
+        match (is_lower, is_upper) {
+            (true, true) => Case::None,
+            (true, false) => Case::LowerCase,
+            // Assume everything else is upper case, because there might be more than one way
+            // to represent upper case.
+            (false, _) => Case::UpperCase,
+        }
+    }
+
+    let mut boundaries = Vec::new();
+
+    // Only the offsets are kept: matching is done against the whole string so that a keyword
+    // can span several words.
+    for (word_idx, word) in text.unicode_word_indices() {
+        boundaries.push(word_idx);
+
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut prev_case: Option<Case> = None;
+        for (grapheme_idx, grapheme) in word.grapheme_indices(true) {
+            let case = classify(grapheme);
+
+            // Consider this a word start if going from no case to any case,
+            // or lower case to upper case. The first grapheme has no predecessor and is
+            // already covered by `word_idx`.
+            if matches!(prev_case, Some(prev) if case > prev) {
+                boundaries.push(word_idx + grapheme_idx);
+            }
+            prev_case = Some(case);
+        }
+    }
+
+    boundaries
+}
+
pub struct DirDisplay<'a> {
dir: &'a Dir<'a>,
}
@@ -134,11 +224,15 @@ impl Display for DirDisplay<'_> {
pub struct DirDisplayScore<'a> {
dir: &'a Dir<'a>,
now: Epoch,
+ keywords: Option>,
}
impl Display for DirDisplayScore<'_> {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
- let (kw_score, score) = self.dir.score(self.now, &vec![]);
+ let no_keywords = Vec::default();
+ let keywords = self.keywords.as_ref().unwrap_or(&no_keywords);
+
+ let (kw_score, score) = self.dir.score(self.now, keywords);
let score = if score > 9999.0 {
9999
} else if score > 0.0 {
@@ -156,9 +250,9 @@ pub type Epoch = u64;
#[cfg(test)]
mod tests {
- use std::borrow::Cow;
+ use std::{borrow::Cow, collections::HashSet};
- use super::{Dir, DirList};
+ use super::{left_word_boundaries, Dir, DirList};
#[test]
fn zero_copy() {
@@ -171,4 +265,32 @@ mod tests {
assert!(matches!(dir.path, Cow::Borrowed(_)))
}
}
+
+    #[test]
+    fn test_left_word_boundaries() {
+        /// True when every offset in `expected` is reported for `text`.
+        /// Extra offsets are allowed, because the function may return extraneous indices.
+        fn reports_all(text: &str, expected: &[usize]) -> bool {
+            let found: HashSet<usize> = left_word_boundaries(text).into_iter().collect();
+            expected.iter().all(|idx| found.contains(idx))
+        }
+
+        assert!(left_word_boundaries("").is_empty());
+        assert_eq!(left_word_boundaries("Hi"), vec![0]);
+
+        assert!(reports_all("hi there", &[0, 3]));
+        assert!(reports_all("hi_there", &[0, 3]));
+
+        // Offsets are byte offsets: "ü" and "µ" each take two bytes in UTF-8.
+        assert_eq!(left_word_boundaries("FürElise"), vec![0, 4]);
+        assert_eq!(left_word_boundaries("uTorrent"), vec![0, 1]);
+        assert_eq!(left_word_boundaries("µTorrent"), vec![0, 2]);
+
+        assert!(reports_all("/path/file.ext", &[1, 6, 11]));
+        assert!(reports_all(r"C:\path\file.ext", &[0, 3, 8, 13]));
+    }
}