Use word boundary detection and give scores based on keyword matching
I'm using a helper library to implement the Unicode word-segmentation
algorithm, but I'm also detecting case changes within a word (from lower to
upper case, or from no case to some case), so the "o" in "Documents" doesn't
start a new word.

Words themselves are not searched; rather, the string is searched starting
at each word boundary, so multi-word sequences match correctly.

This is a basic solution to #260.
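
As a minimal illustration (assuming nothing beyond the unicode-segmentation
crate this commit adds), the segmentation step reports each word together
with its starting byte offset, and those offsets are what seed
left_word_boundaries:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        // For "hi there" this prints offsets 0 and 3 along with the words
        // themselves; the search then starts at each such boundary.
        for (idx, word) in "hi there".unicode_word_indices() {
            println!("{}: {}", idx, word);
        }
    }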

Some things to consider:
- We don't have options to control case handling. If smart-case is
  disabled, the weights in compute_kw_score need to change.
- Right now the keyword score totally overpowers the frequency score; the
  frequency score is only a tie-breaker. The two could be normalized and
  weighted so that a much better frequency score wins despite a slightly
  worse keyword score (see the first sketch after this list).
- Should we detect word endings for exact matches? I'm not sure it would
  give a good user experience. If I frequently access "src9" but not "src",
  I don't want "src" to win just because it more exactly matches what I
  typed; it's hard to refrain from typing a whole word. It also produces an
  interesting wrong result: "c" becomes a perfect match for
  "/mnt/c/anything".
- The above issue can be solved if we consider digits to start a new word
  (see the second sketch after this list).
- I'm testing these changes with `cargo r -- query --list --score b`.
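
Here is the first sketch, for the normalization idea above. The blend
function and the 0.7/0.3 weights are assumptions, not part of this commit:

    // Hypothetical: normalize both components to [0, 1] and blend them, so a
    // much better frequency score can beat a slightly worse keyword score.
    fn combined_score(kw_score: u64, max_kw_score: u64, rank: f64, max_rank: f64) -> f64 {
        let kw_norm = kw_score as f64 / max_kw_score.max(1) as f64;
        let rank_norm = if max_rank > 0.0 { rank / max_rank } else { 0.0 };
        0.7 * kw_norm + 0.3 * rank_norm
    }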
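
And the second sketch, for digits starting a new word. Case::Digit and the
extra boundary clause are hypothetical and not in this commit:

    // Hypothetical extension of the Case enum: give digits their own class,
    // so "src9" splits as "src" + "9" and typing "src" can match the whole
    // word "src" exactly.
    #[derive(PartialEq, Clone, Copy, PartialOrd)]
    enum Case {
        None,
        Digit,
        LowerCase,
        UpperCase,
    }

    // A word starts where the case rank increases (the rule this commit
    // uses), or where a digit run begins after cased letters.
    fn starts_word(prev: Case, curr: Case) -> bool {
        curr > prev || (curr == Case::Digit && prev > Case::Digit)
    }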
lefth committed Sep 17, 2021 · 1 parent ec3ad20 · commit d6f5db0
Showing 5 changed files with 138 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -20,6 +20,7 @@ glob = "0.3.0"
 ordered-float = "2.0.0"
 serde = { version = "1.0.116", features = ["derive"] }
 tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"

 [target.'cfg(windows)'.dependencies]
 rand = { version = "0.8.4", features = [
6 changes: 3 additions & 3 deletions src/app/query.rs
@@ -34,7 +34,7 @@ impl Query {
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
-                writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
             }

             let selection = fzf.wait_select()?;
@@ -49,7 +49,7 @@ impl Query {
             let handle = &mut stdout.lock();
             while let Some(dir) = stream.next() {
                 if self.score {
-                    writeln!(handle, "{}", dir.display_score(now))
+                    writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
                 } else {
                     writeln!(handle, "{}", dir.display())
                 }
@@ -59,7 +59,7 @@ impl Query {
         } else {
             let dir = stream.next().context("no match found")?;
             if self.score {
-                writeln!(io::stdout(), "{}", dir.display_score(now))
+                writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
             } else {
                 writeln!(io::stdout(), "{}", dir.display())
             }
2 changes: 1 addition & 1 deletion src/app/remove.rs
@@ -22,7 +22,7 @@ impl Run for Remove {

         let mut fzf = Fzf::new(true)?;
         while let Some(dir) = stream.next() {
-            writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+            writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
         }

         selection = fzf.wait_select()?;
142 changes: 132 additions & 10 deletions src/db/dir.rs
@@ -5,6 +5,7 @@ use std::ops::{Deref, DerefMut};
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;

 #[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
@@ -88,7 +89,7 @@ pub struct Dir<'a> {
 }

 impl Dir<'_> {
-    pub fn score(&self, now: Epoch, _keywords: &Vec<String>) -> Score {
+    pub fn score(&self, now: Epoch, keywords: &Vec<String>) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
@@ -105,22 +106,111 @@ impl Dir<'_> {
             self.rank * 0.25
         };

-        // TODO: incorporate keywords into the scoring logic, so match quality is more significant
-        // than the access date. See issue #260.
-        let kw_score = 1;
+        let left_word_boundaries = left_word_boundaries(&self.path);

-        (kw_score, adjusted_rank)
+        let mut kw_score_sum = 0;
+        for keyword in keywords {
+            kw_score_sum += self.compute_kw_score(keyword, &left_word_boundaries);
+        }
+
+        (kw_score_sum, adjusted_rank)
     }

+    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &Vec<usize>) -> u64 {
+        let keyword_lower = &keyword.to_lowercase();
+        let path_lower = self.path.to_lowercase();
+
+        // More than one boundary can match.
+        let mut best_boundary_score = 0;
+        for idx in left_word_boundaries {
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let path = &self.path[*idx..];
+            let path_lower = &path_lower[*idx..];
+            if path.starts_with(keyword) {
+                // exact match
+
+                // TODO: think about checking the right word boundary, and give extra points if it matches.
+                // Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
+                // match src_3. But "src" will match src.
+                best_boundary_score = best_boundary_score.max(100);
+            } else if path_lower.starts_with(keyword) {
+                // smart case match
+                best_boundary_score = best_boundary_score.max(90);
+            } else if path_lower.starts_with(keyword_lower) {
+                // wrong case but it's a match otherwise
+                best_boundary_score = best_boundary_score.max(20);
+            }
+
+            // We don't need to give any score for a keyword that matches but not on
+            // a word boundary -- all paths being checked should at least match in that way.
+        }
+        debug_assert!(path_lower.contains(keyword_lower));
+
+        best_boundary_score
+    }
+
     pub fn display(&self) -> DirDisplay {
         DirDisplay { dir: self }
     }

-    pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
-        DirDisplayScore { dir: self, now }
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.map(|vec| vec.iter().cloned().collect()) }
     }
 }

+/// Returns byte indices that correspond to the leftmost position of each word.
+/// For input "hi there", the result will contain 0 and 3.
+///
+/// The result may also contain extraneous indices.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    let mut boundaries = Vec::new();
+
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    // We won't need the words themselves because we want to do multi-word matching.
+    // We need the whole string for that.
+    for (word_idx, word) in text.unicode_word_indices() {
+        boundaries.push(word_idx);
+
+        // Also search for case changes, and non-text characters:
+        // MyDocuments
+        // my_documents
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut prev_case = None;
+        for (grapheme_idx, grapheme) in word.grapheme_indices(true) {
+            let lower = grapheme.to_lowercase();
+            let upper = grapheme.to_uppercase();
+            let case = if lower == grapheme && upper == grapheme {
+                Case::None
+            } else if lower == grapheme {
+                Case::LowerCase
+            } else {
+                // Assume the other cases are upper case, because there might be
+                // more than one way to represent upper case.
+                Case::UpperCase
+            };
+
+            if let Some(prev_case) = &prev_case {
+                if case > *prev_case {
+                    // Consider this a word start if going from no case to any case,
+                    // or lower case to upper case.
+                    boundaries.push(word_idx + grapheme_idx);
+                }
+            }
+            let _ = prev_case.replace(case);
+        }
+    }
+
+    boundaries
+}
+
 pub struct DirDisplay<'a> {
     dir: &'a Dir<'a>,
 }
@@ -134,11 +224,15 @@ impl Display for DirDisplay<'_> {
 pub struct DirDisplayScore<'a> {
     dir: &'a Dir<'a>,
     now: Epoch,
+    keywords: Option<Vec<String>>,
 }

 impl Display for DirDisplayScore<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let (kw_score, score) = self.dir.score(self.now, &vec![]);
+        let no_keywords = Vec::default();
+        let keywords = self.keywords.as_ref().unwrap_or(&no_keywords);
+
+        let (kw_score, score) = self.dir.score(self.now, keywords);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
@@ -156,9 +250,9 @@ pub type Epoch = u64;

 #[cfg(test)]
 mod tests {
-    use std::borrow::Cow;
+    use std::{borrow::Cow, collections::HashSet};

-    use super::{Dir, DirList};
+    use super::{left_word_boundaries, Dir, DirList};

     #[test]
     fn zero_copy() {
@@ -171,4 +265,32 @@ mod tests {
             assert!(matches!(dir.path, Cow::Borrowed(_)))
         }
     }
+
+    #[test]
+    fn test_left_word_boundaries() {
+        assert!(left_word_boundaries("") == vec![]);
+        assert!(left_word_boundaries("Hi") == vec![0]);
+
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi there").into_iter().collect()));
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi_there").into_iter().collect()));
+
+        assert!(vec![0, 4] == left_word_boundaries("FürElise"));
+        assert!(vec![0, 1] == left_word_boundaries("uTorrent"));
+        assert!(vec![0, 2] == left_word_boundaries("µTorrent"));
+
+        assert!(vec![1, 6, 11]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("/path/file.ext").into_iter().collect()));
+        assert!(vec![0, 3, 8, 13]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries(r"C:\path\file.ext").into_iter().collect()));
+    }
 }
