ajeetdsouza · lefth · Sep 17, 2021 · Sep 17, 2021 · Sep 13, 2021 · Nov 26, 2021
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -27,6 +27,7 @@ glob = "0.3.0"
 ordered-float = "2.0.0"
 serde = { version = "1.0.116", features = ["derive"] }
 tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"
 
 [target.'cfg(windows)'.dependencies]
 rand = { version = "0.8.4", features = [

diff --git a/contrib/completions/_zoxide b/contrib/completions/_zoxide
diff --git a/contrib/completions/_zoxide.ps1 b/contrib/completions/_zoxide.ps1
diff --git a/contrib/completions/zoxide.elv b/contrib/completions/zoxide.elv
diff --git a/contrib/completions/zoxide.fish b/contrib/completions/zoxide.fish
diff --git a/contrib/completions/zoxide.ts b/contrib/completions/zoxide.ts
diff --git a/src/app/_app.rs b/src/app/_app.rs
@@ -111,7 +111,7 @@ pub struct Query {
     #[clap(long, short, conflicts_with = "interactive")]
     pub list: bool,
 
-    /// Print score with results
+    /// Print score with results (keyword match score, frequency score)
     #[clap(long, short, conflicts_with = "interactive")]
     pub score: bool,
 

diff --git a/src/app/query.rs b/src/app/query.rs
@@ -30,25 +30,26 @@ impl Query {
             stream = stream.with_exclude(path);
         }
 
+        let mut stream = stream.into_iter();
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
-                writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
             }
 
             let selection = fzf.wait_select()?;
             if self.score {
                 print!("{}", selection);
             } else {
-                let path = selection.get(5..).context("could not read selection from fzf")?;
+                let path = selection.get(10..).context("could not read selection from fzf")?;
                 print!("{}", path);
             }
         } else if self.list {
             let stdout = io::stdout();
             let handle = &mut stdout.lock();
             while let Some(dir) = stream.next() {
                 if self.score {
-                    writeln!(handle, "{}", dir.display_score(now))
+                    writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
                 } else {
                     writeln!(handle, "{}", dir.display())
                 }
@@ -58,7 +59,7 @@ impl Query {
         } else {
             let dir = stream.next().context("no match found")?;
             if self.score {
-                writeln!(io::stdout(), "{}", dir.display_score(now))
+                writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
             } else {
                 writeln!(io::stdout(), "{}", dir.display())
             }

diff --git a/src/app/remove.rs b/src/app/remove.rs
@@ -18,15 +18,15 @@ impl Run for Remove {
         match &self.interactive {
             Some(keywords) => {
                 let now = util::current_time()?;
-                let mut stream = db.stream(now).with_keywords(keywords);
+                let mut stream = db.stream(now).with_keywords(keywords).into_iter();
 
                 let mut fzf = Fzf::new(true)?;
                 while let Some(dir) = stream.next() {
-                    writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                    writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
                 }
 
                 selection = fzf.wait_select()?;
-                let paths = selection.lines().filter_map(|line| line.get(5..));
+                let paths = selection.lines().filter_map(|line| line.get(10..));
                 for path in paths {
                     if !db.remove(path) {
                         bail!("path not found in database: {}", path);

diff --git a/src/db/dir.rs b/src/db/dir.rs
@@ -1,12 +1,15 @@
 use std::borrow::Cow;
 use std::fmt::{self, Display, Formatter};
 use std::ops::{Deref, DerefMut};
+use std::path::PathBuf;
+use std::str::FromStr;
 
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;
 
-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
 
 impl DirList<'_> {
@@ -88,33 +91,161 @@ pub struct Dir<'a> {
 }
 
 impl Dir<'_> {
-    pub fn score(&self, now: Epoch) -> Rank {
+    /// Scores this entry for ranking query results.
+    ///
+    /// Returns `(keyword score, adjusted rank)`:
+    /// * the keyword score is the sum of `compute_kw_score` over every
+    ///   (path component, keyword) pair;
+    /// * the adjusted rank is `self.rank` scaled by how long ago the entry was
+    ///   last accessed relative to `now`.
+    ///
+    /// In debug builds this asserts that every keyword occurs somewhere in the
+    /// path when both are lowercased.
+    pub fn score(&self, now: Epoch, keywords: &Vec<String>) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
 
         // The older the entry, the lesser its importance.
         let duration = now.saturating_sub(self.last_accessed);
-        if duration < HOUR {
+        let adjusted_rank = if duration < HOUR {
             self.rank * 4.0
         } else if duration < DAY {
             self.rank * 2.0
         } else if duration < WEEK {
             self.rank * 0.5
         } else {
             self.rank * 0.25
+        };
+
+        for keyword in keywords {
+            debug_assert!(self.path.to_lowercase().contains(&keyword.to_lowercase()));
+        }
+
+        let mut kw_score_sum = 0;
+
+        // Smart case is on only while every keyword is already entirely lowercase;
+        // path components are then lowercased before they are compared.
+        let smart_case = keywords.iter().all(|kw| &kw.to_lowercase() == kw);
+
+        // Split the path into components, then words, so the "m" can be a better match
+        // for "folk music" than for "tom", and the best match for "music".
+        // And even more so if it's the last path component.
+        let path = PathBuf::from_str(&self.path).unwrap(); // safe because error is Infallible
+        let path_components = path.components();
+        // Components are visited from last to first, so only the first iteration
+        // sees the final path component.
+        let mut is_last_component = true;
+        for component in path_components.rev() {
+            let component = component.as_os_str().to_str().unwrap(); // safe because the path came from a string
+            let component = if smart_case { component.to_lowercase() } else { component.to_owned() };
+
+            // NOTE(review): with smart case on, `component` is already lowercased here,
+            // so the lower-to-upper transitions that `left_word_boundaries` treats as
+            // word starts (e.g. "MyDocuments") can no longer occur -- confirm whether
+            // boundaries should be computed before lowercasing.
+            let left_word_boundaries = left_word_boundaries(&component);
+            for keyword in keywords {
+                kw_score_sum +=
+                    Self::compute_kw_score(&component, keyword, &left_word_boundaries, smart_case, is_last_component);
+            }
+            is_last_component = false;
         }
+
+        (kw_score_sum, adjusted_rank)
+    }
+
+    /// Scores how well `keyword` matches the start of a word in `path_component`.
+    ///
+    /// `left_word_boundaries` holds byte offsets into `path_component` at which a
+    /// word may start. The best score over all boundaries is returned:
+    ///
+    /// * `105` -- the component itself starts with `keyword` (boundary `0`);
+    /// * `100` -- a later word starts with `keyword`;
+    /// * `25`  -- `smart_case` is off and the lowercased word starts with `keyword`
+    ///   exactly as typed;
+    /// * `20`  -- the word starts with `keyword` only once both are lowercased;
+    /// * `0`   -- no boundary matches.
+    ///
+    /// A non-zero score gets `5` extra points when `is_last_component` is set.
+    ///
+    /// Offsets that are out of range or not on a character boundary are skipped.
+    pub fn compute_kw_score(
+        path_component: &str,
+        keyword: &str,
+        left_word_boundaries: &[usize],
+        smart_case: bool,
+        is_last_component: bool,
+    ) -> u64 {
+        let keyword_lower = keyword.to_lowercase();
+
+        // more than one boundary can match
+        let mut best_boundary_score = 0;
+        for &idx in left_word_boundaries {
+            // Checked slicing: a bad offset is ignored instead of panicking.
+            let word = match path_component.get(idx..) {
+                Some(word) => word,
+                None => continue,
+            };
+
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let score = if word.starts_with(keyword) {
+                // exact match, but even better if it's at the leftmost position in the component,
+                // like "D" matching $HOME/Documents
+                //
+                // TODO: think about checking the right word boundary, and give extra points if it matches.
+                //       Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
+                //       match src_3. But "src" will match src.
+                if idx == 0 {
+                    105
+                } else {
+                    100
+                }
+            } else {
+                // Lowercase the suffix itself instead of slicing a lowercased copy of the
+                // whole component: lowercasing can change the byte length of a string, so
+                // `idx` is only guaranteed to be a valid offset into the original text.
+                let word_lower = word.to_lowercase();
+                if !smart_case && word_lower.starts_with(keyword) {
+                    // smart case is off (a keyword has case), but this keyword alone would be a
+                    // smart case match for the component.
+                    25
+                } else if word_lower.starts_with(keyword_lower.as_str()) {
+                    // wrong case but it's a match otherwise
+                    20
+                } else {
+                    // No score. We don't need to give any score for a keyword that matches but not on
+                    // a word boundary -- all paths being checked should at least match in that way.
+                    // But note that though the path will match the keyword, this path component may not.
+                    0
+                }
+            };
+            best_boundary_score = best_boundary_score.max(score);
+        }
+
+        if best_boundary_score > 0 && is_last_component {
+            // matches in the last path component should be considered a little better
+            best_boundary_score += 5;
+        }
+
+        best_boundary_score
     }
 
     pub fn display(&self) -> DirDisplay {
         DirDisplay { dir: self }
     }
 
-    pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
-        DirDisplayScore { dir: self, now }
+    /// Returns a `DirDisplayScore` that borrows this entry and stores `now`
+    /// together with an owned copy of `keywords` (or `None` when no keywords
+    /// are given).
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.cloned() }
     }
 }
 
+/// Collects the byte offsets in `text` at which a word may begin.
+///
+/// Every Unicode word start is reported, plus every position inside a word
+/// where the letter case "rises": uncased to cased, or lower case to upper
+/// case. For "hi there" the result contains 0 and 3.
+///
+/// Callers must tolerate offsets that are not really word starts.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    // Variant order matters: the derived `PartialOrd` is what defines a "rise".
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    /// Classifies one grapheme by comparing it with its lower/upper case forms.
+    fn classify(grapheme: &str) -> Case {
+        let unchanged_by_lower = grapheme.to_lowercase() == grapheme;
+        let unchanged_by_upper = grapheme.to_uppercase() == grapheme;
+        match (unchanged_by_lower, unchanged_by_upper) {
+            (true, true) => Case::None,
+            (true, false) => Case::LowerCase,
+            // Anything that is not its own lower case form is treated as upper
+            // case, since upper case may have more than one representation.
+            (false, _) => Case::UpperCase,
+        }
+    }
+
+    let mut starts = Vec::new();
+
+    // Only offsets are kept: matching is done against the whole string so that a
+    // keyword can span several words.
+    for (word_start, word) in text.unicode_word_indices() {
+        starts.push(word_start);
+
+        // Look inside the word for case changes and non-text characters:
+        // MyDocuments
+        // my_documents
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut previous: Option<Case> = None;
+        for (offset, grapheme) in word.grapheme_indices(true) {
+            let current = classify(grapheme);
+            if previous.map_or(false, |before| current > before) {
+                starts.push(word_start + offset);
+            }
+            previous = Some(current);
+        }
+    }
+
+    starts
+}
+
 pub struct DirDisplay<'a> {
     dir: &'a Dir<'a>,
 }
@@ -128,30 +259,35 @@ impl Display for DirDisplay<'_> {
+/// Display adapter that prints a directory entry together with its scores.
 pub struct DirDisplayScore<'a> {
     dir: &'a Dir<'a>,
     now: Epoch,
+    // Owned copy of the query keywords; `None` is scored as an empty keyword list.
+    keywords: Option<Vec<String>>,
 }
 
 impl Display for DirDisplayScore<'_> {
+    /// Writes `"<keyword score>,<frequency score> <path>"`, each score
+    /// right-aligned in 4 columns and clamped to `0..=9999`.
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let score = self.dir.score(self.now);
+        let no_keywords = Vec::default();
+        let keywords = self.keywords.as_ref().unwrap_or(&no_keywords);
+
+        let (kw_score, score) = self.dir.score(self.now, keywords);
+        // Clamp like the frequency score below: the interactive query/remove code
+        // recovers the path with `get(10..)`, so the score prefix must stay
+        // exactly 10 bytes wide.
+        let kw_score = kw_score.min(9999);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
             score as u32
         } else {
             0
         };
-        write!(f, "{:>4} {}", score, self.dir.path)
+        write!(f, "{:>4},{:>4} {}", kw_score, score, self.dir.path)
     }
 }
 
 pub type Rank = f64;
+pub type Score = (u64, Rank);
 pub type Epoch = u64;
 
 #[cfg(test)]
 mod tests {
-    use std::borrow::Cow;
+    use std::{borrow::Cow, collections::HashSet};
 
-    use super::{Dir, DirList};
+    use super::{left_word_boundaries, Dir, DirList};
 
     #[test]
     fn zero_copy() {
@@ -164,4 +300,32 @@ mod tests {
             assert!(matches!(dir.path, Cow::Borrowed(_)))
         }
     }
+
+    #[test]
+    fn test_left_word_boundaries() {
+        /// Asserts that every offset in `expected` is reported for `text`.
+        /// Extra offsets are allowed, as `left_word_boundaries` may over-report.
+        fn assert_contains(text: &str, expected: &[usize]) {
+            let found = left_word_boundaries(text);
+            let found_set: HashSet<usize> = found.iter().copied().collect();
+            for idx in expected {
+                assert!(found_set.contains(idx), "boundary {} missing for {:?}: got {:?}", idx, text, found);
+            }
+        }
+
+        assert_eq!(left_word_boundaries(""), Vec::<usize>::new());
+        assert_eq!(left_word_boundaries("Hi"), vec![0]);
+
+        assert_contains("hi there", &[0, 3]);
+        assert_contains("hi_there", &[0, 3]);
+
+        // Offsets are in bytes: "ü" and "µ" are two bytes each in UTF-8.
+        assert_eq!(left_word_boundaries("FürElise"), vec![0, 4]);
+        assert_eq!(left_word_boundaries("uTorrent"), vec![0, 1]);
+        assert_eq!(left_word_boundaries("µTorrent"), vec![0, 2]);
+
+        assert_contains("/path/file.ext", &[1, 6, 11]);
+        assert_contains(r"C:\path\file.ext", &[0, 3, 8, 13]);
+    }
 }