From d55f827545aa18af3d7a7e6fdfaef2a416373ed5 Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Fri, 17 Sep 2021 12:46:11 +0800
Subject: [PATCH 1/7] Make new iterator to delay sorting

Sorting now happens in `into_iter()`, which consumes the stream, so keywords
or options can no longer be added after sorting is done. This makes it safer
to implement #260.
---
 src/app/query.rs  |  1 +
 src/app/remove.rs |  2 +-
 src/db/dir.rs     |  2 +-
 src/db/stream.rs  | 73 ++++++++++++++++++++++++++++-------------------
 4 files changed, 46 insertions(+), 32 deletions(-)
diff --git a/src/app/query.rs b/src/app/query.rs
index cb1de833..8541189d 100644
--- a/src/app/query.rs
+++ b/src/app/query.rs
@@ -30,6 +30,7 @@ impl Query {
             stream = stream.with_exclude(path);
         }
 
+        let mut stream = stream.into_iter();
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
diff --git a/src/app/remove.rs b/src/app/remove.rs
index 18334712..ba952c39 100644
--- a/src/app/remove.rs
+++ b/src/app/remove.rs
@@ -18,7 +18,7 @@ impl Run for Remove {
         match &self.interactive {
             Some(keywords) => {
                 let now = util::current_time()?;
-                let mut stream = db.stream(now).with_keywords(keywords);
+                let mut stream = db.stream(now).with_keywords(keywords).into_iter();
 
                 let mut fzf = Fzf::new(true)?;
                 while let Some(dir) = stream.next() {
diff --git a/src/db/dir.rs b/src/db/dir.rs
index 1661a1fe..c133a48b 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -6,7 +6,7 @@ use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
 
-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
 
 impl DirList<'_> {
diff --git a/src/db/stream.rs b/src/db/stream.rs
index e5d3eb98..fa7d6621 100644
--- a/src/db/stream.rs
+++ b/src/db/stream.rs
@@ -9,7 +9,6 @@ use crate::util;
 
 pub struct Stream<'db, 'file> {
     db: &'db mut Database<'file>,
-    idxs: Rev<Range<usize>>,
 
     keywords: Vec<String>,
 
@@ -18,25 +17,22 @@ pub struct Stream<'db, 'file> {
     resolve_symlinks: bool,
 
     exclude_path: Option<String>,
+    now: Epoch,
 }
 
 impl<'db, 'file> Stream<'db, 'file> {
     pub fn new(db: &'db mut Database<'file>, now: Epoch) -> Self {
-        // Iterate in descending order of score.
-        db.dirs.sort_unstable_by_key(|dir| OrderedFloat(dir.score(now)));
-        let idxs = (0..db.dirs.len()).rev();
-
         // If a directory is deleted and hasn't been used for 90 days, delete it from the database.
         let expire_below = now.saturating_sub(90 * 24 * 60 * 60);
 
         Stream {
             db,
-            idxs,
             keywords: Vec::new(),
             check_exists: false,
             expire_below,
             resolve_symlinks: false,
             exclude_path: None,
+            now,
         }
     }
 
@@ -56,31 +52,14 @@ impl<'db, 'file> Stream<'db, 'file> {
         self
     }
 
-    pub fn next(&mut self) -> Option<&Dir<'file>> {
-        while let Some(idx) = self.idxs.next() {
-            let dir = &self.db.dirs[idx];
-
-            if !self.matches_keywords(&dir.path) {
-                continue;
-            }
-
-            if !self.matches_exists(&dir.path) {
-                if dir.last_accessed < self.expire_below {
-                    self.db.dirs.swap_remove(idx);
-                    self.db.modified = true;
-                }
-                continue;
-            }
-
-            if Some(dir.path.as_ref()) == self.exclude_path.as_deref() {
-                continue;
-            }
-
-            let dir = &self.db.dirs[idx];
-            return Some(dir);
-        }
+    pub fn into_iter(self) -> StreamIterator<'db, 'file> {
+        let mut dirs = std::mem::take(&mut self.db.dirs);
+        // Iterate in descending order of score.
+        dirs.sort_unstable_by_key(|dir| OrderedFloat(dir.score(self.now)));
+        let _ = std::mem::replace(&mut self.db.dirs, dirs);
+        let idxs = (0..self.db.dirs.len()).rev();
 
-        None
+        StreamIterator { stream: self, idxs }
     }
 
     fn matches_exists<S: AsRef<str>>(&self, path: S) -> bool {
@@ -120,6 +99,40 @@ impl<'db, 'file> Stream<'db, 'file> {
     }
 }
 
+pub struct StreamIterator<'db, 'file> {
+    stream: Stream<'db, 'file>,
+    idxs: Rev<Range<usize>>,
+}
+
+impl<'db, 'file> StreamIterator<'db, 'file> {
+    pub fn next(&mut self) -> Option<&Dir<'file>> {
+        while let Some(idx) = self.idxs.next() {
+            let dir = &self.stream.db.dirs[idx];
+
+            if !self.stream.matches_keywords(&dir.path) {
+                continue;
+            }
+
+            if !self.stream.matches_exists(&dir.path) {
+                if dir.last_accessed < self.stream.expire_below {
+                    self.stream.db.dirs.swap_remove(idx);
+                    self.stream.db.modified = true;
+                }
+                continue;
+            }
+
+            if Some(dir.path.as_ref()) == self.stream.exclude_path.as_deref() {
+                continue;
+            }
+
+            let dir = &self.stream.db.dirs[idx];
+            return Some(dir);
+        }
+
+        None
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::path::PathBuf;

From 6eba7d0d5673d4de395f514e2c6c19d37e2f7f07 Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Fri, 17 Sep 2021 18:41:25 +0800
Subject: [PATCH 2/7] Preliminary: score directories based on search keywords

Keyword-based scoring is currently a no-op: every directory gets the same
keyword score. Directory filtering is now done before scoring, except for the
existence check, which mutates the database and is complex to execute earlier.

This is a step toward implementing #260.
---
 src/app/query.rs  |  2 +-
 src/app/remove.rs |  2 +-
 src/db/dir.rs     | 17 ++++++++++++-----
 src/db/stream.rs  | 30 ++++++++++++++----------------
 4 files changed, 28 insertions(+), 23 deletions(-)

diff --git a/src/app/query.rs b/src/app/query.rs
index 8541189d..b888b6a2 100644
--- a/src/app/query.rs
+++ b/src/app/query.rs
@@ -41,7 +41,7 @@ impl Query {
             if self.score {
                 print!("{}", selection);
             } else {
-                let path = selection.get(5..).context("could not read selection from fzf")?;
+                let path = selection.get(10..).context("could not read selection from fzf")?;
                 print!("{}", path);
             }
         } else if self.list {
diff --git a/src/app/remove.rs b/src/app/remove.rs
index ba952c39..9175b2b8 100644
--- a/src/app/remove.rs
+++ b/src/app/remove.rs
@@ -26,7 +26,7 @@ impl Run for Remove {
                 }
 
                 selection = fzf.wait_select()?;
-                let paths = selection.lines().filter_map(|line| line.get(5..));
+                let paths = selection.lines().filter_map(|line| line.get(10..));
                 for path in paths {
                     if !db.remove(path) {
                         bail!("path not found in database: {}", path);
diff --git a/src/db/dir.rs b/src/db/dir.rs
index c133a48b..1ef62af5 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -88,14 +88,14 @@ pub struct Dir<'a> {
 }
 
 impl Dir<'_> {
-    pub fn score(&self, now: Epoch) -> Rank {
+    pub fn score(&self, now: Epoch, _keywords: &Vec<String>) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
 
         // The older the entry, the lesser its importance.
         let duration = now.saturating_sub(self.last_accessed);
-        if duration < HOUR {
+        let adjusted_rank = if duration < HOUR {
             self.rank * 4.0
         } else if duration < DAY {
             self.rank * 2.0
@@ -103,7 +103,13 @@ impl Dir<'_> {
             self.rank * 0.5
         } else {
             self.rank * 0.25
-        }
+        };
+
+        // TODO: incorporate keywords into the scoring logic, so match quality is more significant
+        // than the access date. See issue #260.
+        let kw_score = 1;
+
+        (kw_score, adjusted_rank)
     }
 
     pub fn display(&self) -> DirDisplay {
@@ -132,7 +138,7 @@ pub struct DirDisplayScore<'a> {
 
 impl Display for DirDisplayScore<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let score = self.dir.score(self.now);
+        let (kw_score, score) = self.dir.score(self.now, &vec![]);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
@@ -140,11 +146,12 @@ impl Display for DirDisplayScore<'_> {
         } else {
             0
         };
-        write!(f, "{:>4} {}", score, self.dir.path)
+        write!(f, "{:>4},{:>4} {}", kw_score, score, self.dir.path)
     }
 }
 
 pub type Rank = f64;
+pub type Score = (u64, Rank);
 pub type Epoch = u64;
 
 #[cfg(test)]
diff --git a/src/db/stream.rs b/src/db/stream.rs
index fa7d6621..0204134b 100644
--- a/src/db/stream.rs
+++ b/src/db/stream.rs
@@ -1,5 +1,3 @@
-use std::iter::Rev;
-use std::ops::Range;
 use std::{fs, path};
 
 use ordered_float::OrderedFloat;
@@ -53,13 +51,21 @@ impl<'db, 'file> Stream<'db, 'file> {
     }
 
     pub fn into_iter(self) -> StreamIterator<'db, 'file> {
-        let mut dirs = std::mem::take(&mut self.db.dirs);
+        let mut idxs: Vec<_> = self.db.dirs.iter()
+            .enumerate() // store the original indices before filtering
+            .filter(|(_idx, dir)|
+                self.matches_keywords(&dir.path) &&
+                    Some(dir.path.as_ref()) != self.exclude_path.as_deref())
+            .collect();
+
         // Iterate in descending order of score.
-        dirs.sort_unstable_by_key(|dir| OrderedFloat(dir.score(self.now)));
-        let _ = std::mem::replace(&mut self.db.dirs, dirs);
-        let idxs = (0..self.db.dirs.len()).rev();
+        idxs.sort_by_cached_key(|(_idx, dir)| {
+            let (kw_score, frequency_score) = dir.score(self.now, &self.keywords);
+            (kw_score, OrderedFloat(frequency_score))
+        });
+        let idxs = idxs.into_iter().map(|(idx, _)| idx).rev().collect::<Vec<_>>().into_iter(); // copy the indices to avoid lifetime issues
 
-        StreamIterator { stream: self, idxs }
+        StreamIterator { stream: self, idxs: Box::new(idxs) }
     }
 
     fn matches_exists<S: AsRef<str>>(&self, path: S) -> bool {
@@ -101,7 +107,7 @@ impl<'db, 'file> Stream<'db, 'file> {
 
 pub struct StreamIterator<'db, 'file> {
     stream: Stream<'db, 'file>,
-    idxs: Rev<Range<usize>>,
+    idxs: Box<dyn Iterator<Item = usize>>,
 }
 
 impl<'db, 'file> StreamIterator<'db, 'file> {
@@ -109,10 +115,6 @@ impl<'db, 'file> StreamIterator<'db, 'file> {
         while let Some(idx) = self.idxs.next() {
             let dir = &self.stream.db.dirs[idx];
 
-            if !self.stream.matches_keywords(&dir.path) {
-                continue;
-            }
-
             if !self.stream.matches_exists(&dir.path) {
                 if dir.last_accessed < self.stream.expire_below {
                     self.stream.db.dirs.swap_remove(idx);
@@ -121,10 +123,6 @@ impl<'db, 'file> StreamIterator<'db, 'file> {
                 continue;
             }
 
-            if Some(dir.path.as_ref()) == self.stream.exclude_path.as_deref() {
-                continue;
-            }
-
             let dir = &self.stream.db.dirs[idx];
             return Some(dir);
         }

From 21886e506cfc69efa3dd66899c1161c43cea77bd Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Mon, 13 Sep 2021 14:34:48 +0800
Subject: [PATCH 3/7] Use word boundary detection and give scores based on
 keyword matching

I'm using a helper library (unicode-segmentation) for Unicode word
segmentation, and I'm also detecting case changes within a word. A new word
starts when going from lower to upper case, or from no case to some case,
so the "D" in "MyDocuments" starts a new word but the "o" does not.

Individual words are not searched. Instead, the whole string is searched
starting at each word boundary, so that multi-word sequences match correctly.

This is a basic solution to #260.

Some things to consider:
- We don't have options to control the case. If smart-case is disabled,
  the weights in compute_kw_score need to change.
- Right now keyword score totally overpowers the frequency score. The
  frequency score is only a tie-breaker. They could be normalized and
  weighted so a much better frequency score would win despite a slightly
  worse keyword score.
- Should we detect word endings for exact matches? I'm not sure it would
  give a good user experience. If I frequently access "src9" but not "src",
  I don't want "src" to win just because it more exactly matches what I
  typed. It's hard to refrain from typing a whole word. This gives an
  interesting wrong result with "c" being a perfect match for
  "/mnt/c/anything".
- The above issue can be solved if we consider digits to start a new word.
- I'm testing these changes with `cargo r -- query --list --score b`.
---
 Cargo.lock        |   1 +
 Cargo.toml        |   1 +
 src/app/query.rs  |   6 +-
 src/app/remove.rs |   2 +-
 src/db/dir.rs     | 142 ++++++++++++++++++++++++++++++++++++++++++----
 5 files changed, 138 insertions(+), 14 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e897a1cd..220535a3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -875,4 +875,5 @@ dependencies = [
  "rstest",
  "serde",
  "tempfile",
+ "unicode-segmentation",
 ]
diff --git a/Cargo.toml b/Cargo.toml
index 61c00d0f..a40a07b9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,6 +27,7 @@ glob = "0.3.0"
 ordered-float = "2.0.0"
 serde = { version = "1.0.116", features = ["derive"] }
 tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"
 
 [target.'cfg(windows)'.dependencies]
 rand = { version = "0.8.4", features = [
diff --git a/src/app/query.rs b/src/app/query.rs
index b888b6a2..0fa5a725 100644
--- a/src/app/query.rs
+++ b/src/app/query.rs
@@ -34,7 +34,7 @@ impl Query {
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
-                writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
             }
 
             let selection = fzf.wait_select()?;
@@ -49,7 +49,7 @@ impl Query {
             let handle = &mut stdout.lock();
             while let Some(dir) = stream.next() {
                 if self.score {
-                    writeln!(handle, "{}", dir.display_score(now))
+                    writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
                 } else {
                     writeln!(handle, "{}", dir.display())
                 }
@@ -59,7 +59,7 @@ impl Query {
         } else {
             let dir = stream.next().context("no match found")?;
             if self.score {
-                writeln!(io::stdout(), "{}", dir.display_score(now))
+                writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
             } else {
                 writeln!(io::stdout(), "{}", dir.display())
             }
diff --git a/src/app/remove.rs b/src/app/remove.rs
index 9175b2b8..b3b90ff4 100644
--- a/src/app/remove.rs
+++ b/src/app/remove.rs
@@ -22,7 +22,7 @@ impl Run for Remove {
 
                 let mut fzf = Fzf::new(true)?;
                 while let Some(dir) = stream.next() {
-                    writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                    writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
                 }
 
                 selection = fzf.wait_select()?;
diff --git a/src/db/dir.rs b/src/db/dir.rs
index 1ef62af5..7cf6934e 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -5,6 +5,7 @@ use std::ops::{Deref, DerefMut};
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;
 
 #[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
@@ -88,7 +89,7 @@ pub struct Dir<'a> {
 }
 
 impl Dir<'_> {
-    pub fn score(&self, now: Epoch, _keywords: &Vec<String>) -> Score {
+    pub fn score(&self, now: Epoch, keywords: &Vec<String>) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
@@ -105,22 +106,111 @@ impl Dir<'_> {
             self.rank * 0.25
         };
 
-        // TODO: incorporate keywords into the scoring logic, so match quality is more significant
-        // than the access date. See issue #260.
-        let kw_score = 1;
+        let left_word_boundaries = left_word_boundaries(&self.path);
 
-        (kw_score, adjusted_rank)
+        let mut kw_score_sum = 0;
+        for keyword in keywords {
+            kw_score_sum += self.compute_kw_score(keyword, &left_word_boundaries);
+        }
+
+        (kw_score_sum, adjusted_rank)
+    }
+
+    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &Vec<usize>) -> u64 {
+        let keyword_lower = &keyword.to_lowercase();
+        let path_lower = self.path.to_lowercase();
+
+        // more than one boundary can match
+        let mut best_boundary_score = 0;
+        for idx in left_word_boundaries {
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let path = &self.path[*idx..];
+            let path_lower = &path_lower[*idx..];
+            if path.starts_with(keyword) {
+                // exact match
+
+                // TODO: think about checking the right word boundary, and give extra points if it matches.
+                //       Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
+                //       match src_3. But "src" will match src.
+                best_boundary_score = best_boundary_score.max(100);
+            } else if path_lower.starts_with(keyword) {
+                // smart case match
+                best_boundary_score = best_boundary_score.max(90);
+            } else if path_lower.starts_with(keyword_lower) {
+                // wrong case but it's a match otherwise
+                best_boundary_score = best_boundary_score.max(20);
+            }
+
+            // We don't need to give any score for a keyword that matches but not on a word boundary--
+            // All paths being checked should at least match in that way.
+        }
+        debug_assert!(path_lower.contains(keyword_lower));
+
+        best_boundary_score
     }
 
     pub fn display(&self) -> DirDisplay {
         DirDisplay { dir: self }
     }
 
-    pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
-        DirDisplayScore { dir: self, now }
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.map(|vec| vec.iter().cloned().collect()) }
     }
 }
 
+/// Returns byte indices that correspond to the leftmost position of each word.
+/// For input "hi there", the result will contain 0 and 3.
+///
+/// The result may also contain extraneous indices.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    let mut boundaries = Vec::new();
+
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    // We won't need the words themselves because we want to do multi-word match.
+    // We need the whole string for that.
+    for (word_idx, word) in text.unicode_word_indices() {
+        boundaries.push(word_idx);
+
+        // Also search for case changes, and non-text characters:
+        // MyDocuments
+        // my_documents
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut prev_case = None;
+        for (grapheme_idx, grapheme) in word.grapheme_indices(true) {
+            let lower = grapheme.to_lowercase();
+            let upper = grapheme.to_uppercase();
+            let case = if lower == grapheme && upper == grapheme {
+                Case::None
+            } else if lower == grapheme {
+                Case::LowerCase
+            } else {
+                // Assume the other cases are upper case, because there might be more than
+                // one way to represent upper case
+                Case::UpperCase
+            };
+
+            if let Some(prev_case) = &prev_case {
+                if case > *prev_case {
+                    // Consider this a word start if going from no case to any case,
+                    // or lower case to upper case.
+                    boundaries.push(word_idx + grapheme_idx);
+                }
+            }
+            let _ = prev_case.replace(case);
+        }
+    }
+
+    boundaries
+}
+
 pub struct DirDisplay<'a> {
     dir: &'a Dir<'a>,
 }
@@ -134,11 +224,15 @@ impl Display for DirDisplay<'_> {
 pub struct DirDisplayScore<'a> {
     dir: &'a Dir<'a>,
     now: Epoch,
+    keywords: Option<Vec<String>>,
 }
 
 impl Display for DirDisplayScore<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let (kw_score, score) = self.dir.score(self.now, &vec![]);
+        let no_keywords = Vec::default();
+        let keywords = self.keywords.as_ref().unwrap_or(&no_keywords);
+
+        let (kw_score, score) = self.dir.score(self.now, keywords);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
@@ -156,9 +250,9 @@ pub type Epoch = u64;
 
 #[cfg(test)]
 mod tests {
-    use std::borrow::Cow;
+    use std::{borrow::Cow, collections::HashSet};
 
-    use super::{Dir, DirList};
+    use super::{left_word_boundaries, Dir, DirList};
 
     #[test]
     fn zero_copy() {
@@ -171,4 +265,32 @@ mod tests {
             assert!(matches!(dir.path, Cow::Borrowed(_)))
         }
     }
+
+    #[test]
+    fn test_left_word_boundaries() {
+        assert!(left_word_boundaries("") == vec![]);
+        assert!(left_word_boundaries("Hi") == vec![0]);
+
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi there").into_iter().collect()));
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi_there").into_iter().collect()));
+
+        assert!(vec![0, 4] == left_word_boundaries("FürElise"));
+        assert!(vec![0, 1] == left_word_boundaries("uTorrent"));
+        assert!(vec![0, 2] == left_word_boundaries("µTorrent"));
+
+        assert!(vec![1, 6, 11]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("/path/file.ext").into_iter().collect()));
+        assert!(vec![0, 3, 8, 13]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries(r"C:\path\file.ext").into_iter().collect()));
+    }
 }

From 23136619b3bc423351d4cb6429fe2fffd96a1dff Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Fri, 26 Nov 2021 13:51:59 +0800
Subject: [PATCH 4/7] Update documentation and completion strings: describe the
 score format.

---
 contrib/completions/_zoxide     | 4 ++--
 contrib/completions/_zoxide.ps1 | 4 ++--
 contrib/completions/zoxide.elv  | 4 ++--
 contrib/completions/zoxide.fish | 2 +-
 contrib/completions/zoxide.ts   | 2 +-
 src/app/_app.rs                 | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/contrib/completions/_zoxide b/contrib/completions/_zoxide
index 178a27d3..2fe0400d 100644
--- a/contrib/completions/_zoxide
+++ b/contrib/completions/_zoxide
@@ -68,8 +68,8 @@ _arguments "${_arguments_options[@]}" \
 '(-l --list)--interactive[Use interactive selection]' \
 '(-i --interactive)-l[List all matching directories]' \
 '(-i --interactive)--list[List all matching directories]' \
-'(-i --interactive)-s[Print score with results]' \
-'(-i --interactive)--score[Print score with results]' \
+'(-i --interactive)-s[Print score with results (keyword match score, frequency score)]' \
+'(-i --interactive)--score[Print score with results (keyword match score, frequency score)]' \
 '-h[Print help information]' \
 '--help[Print help information]' \
 '-V[Print version information]' \
diff --git a/contrib/completions/_zoxide.ps1 b/contrib/completions/_zoxide.ps1
index 72138e63..4a427e6a 100644
--- a/contrib/completions/_zoxide.ps1
+++ b/contrib/completions/_zoxide.ps1
@@ -64,8 +64,8 @@ Register-ArgumentCompleter -Native -CommandName 'zoxide' -ScriptBlock {
             [CompletionResult]::new('--interactive', 'interactive', [CompletionResultType]::ParameterName, 'Use interactive selection')
             [CompletionResult]::new('-l', 'l', [CompletionResultType]::ParameterName, 'List all matching directories')
             [CompletionResult]::new('--list', 'list', [CompletionResultType]::ParameterName, 'List all matching directories')
-            [CompletionResult]::new('-s', 's', [CompletionResultType]::ParameterName, 'Print score with results')
-            [CompletionResult]::new('--score', 'score', [CompletionResultType]::ParameterName, 'Print score with results')
+            [CompletionResult]::new('-s', 's', [CompletionResultType]::ParameterName, 'Print score with results (keyword match score, frequency score)')
+            [CompletionResult]::new('--score', 'score', [CompletionResultType]::ParameterName, 'Print score with results (keyword match score, frequency score)')
             [CompletionResult]::new('-h', 'h', [CompletionResultType]::ParameterName, 'Print help information')
             [CompletionResult]::new('--help', 'help', [CompletionResultType]::ParameterName, 'Print help information')
             [CompletionResult]::new('-V', 'V', [CompletionResultType]::ParameterName, 'Print version information')
diff --git a/contrib/completions/zoxide.elv b/contrib/completions/zoxide.elv
index dfdebc23..5783df10 100644
--- a/contrib/completions/zoxide.elv
+++ b/contrib/completions/zoxide.elv
@@ -58,8 +58,8 @@ set edit:completion:arg-completer[zoxide] = [@words]{
             cand --interactive 'Use interactive selection'
             cand -l 'List all matching directories'
             cand --list 'List all matching directories'
-            cand -s 'Print score with results'
-            cand --score 'Print score with results'
+            cand -s 'Print score with results (keyword match score, frequency score)'
+            cand --score 'Print score with results (keyword match score, frequency score)'
             cand -h 'Print help information'
             cand --help 'Print help information'
             cand -V 'Print version information'
diff --git a/contrib/completions/zoxide.fish b/contrib/completions/zoxide.fish
index 1ca8db01..3d4a6077 100644
--- a/contrib/completions/zoxide.fish
+++ b/contrib/completions/zoxide.fish
@@ -20,7 +20,7 @@ complete -c zoxide -n "__fish_seen_subcommand_from query" -l exclude -d 'Exclude
 complete -c zoxide -n "__fish_seen_subcommand_from query" -l all -d 'Show deleted directories'
 complete -c zoxide -n "__fish_seen_subcommand_from query" -s i -l interactive -d 'Use interactive selection'
 complete -c zoxide -n "__fish_seen_subcommand_from query" -s l -l list -d 'List all matching directories'
-complete -c zoxide -n "__fish_seen_subcommand_from query" -s s -l score -d 'Print score with results'
+complete -c zoxide -n "__fish_seen_subcommand_from query" -s s -l score -d 'Print score with results (keyword match score, frequency score)'
 complete -c zoxide -n "__fish_seen_subcommand_from query" -s h -l help -d 'Print help information'
 complete -c zoxide -n "__fish_seen_subcommand_from query" -s V -l version -d 'Print version information'
 complete -c zoxide -n "__fish_seen_subcommand_from remove" -s i -l interactive -r
diff --git a/contrib/completions/zoxide.ts b/contrib/completions/zoxide.ts
index 0c41a758..b86362a1 100644
--- a/contrib/completions/zoxide.ts
+++ b/contrib/completions/zoxide.ts
@@ -159,7 +159,7 @@ const completion: Fig.Spec = {
         },
         {
           name: ["-s", "--score"],
-          description: "Print score with results",
+          description: "Print score with results (keyword match score, frequency score)",
         },
         {
           name: ["-h", "--help"],
diff --git a/src/app/_app.rs b/src/app/_app.rs
index e40e3e74..941358b9 100644
--- a/src/app/_app.rs
+++ b/src/app/_app.rs
@@ -111,7 +111,7 @@ pub struct Query {
     #[clap(long, short, conflicts_with = "interactive")]
     pub list: bool,
 
-    /// Print score with results
+    /// Print score with results (keyword match score, frequency score)
     #[clap(long, short, conflicts_with = "interactive")]
     pub score: bool,
 

From f94db841a923f20650c8d798dbe361828ec982c9 Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Sat, 27 Nov 2021 11:06:23 +0800
Subject: [PATCH 5/7] Matches in the last path component should be considered a
 little better than other matches.

---
 src/db/dir.rs | 52 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 39 insertions(+), 13 deletions(-)

diff --git a/src/db/dir.rs b/src/db/dir.rs
index 7cf6934e..29bca175 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -1,6 +1,8 @@
 use std::borrow::Cow;
 use std::fmt::{self, Display, Formatter};
 use std::ops::{Deref, DerefMut};
+use std::path::PathBuf;
+use std::str::FromStr;
 
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
@@ -106,19 +108,38 @@ impl Dir<'_> {
             self.rank * 0.25
         };
 
-        let left_word_boundaries = left_word_boundaries(&self.path);
+        for keyword in keywords {
+            debug_assert!(self.path.to_lowercase().contains(&keyword.to_lowercase()));
+        }
 
         let mut kw_score_sum = 0;
-        for keyword in keywords {
-            kw_score_sum += self.compute_kw_score(keyword, &left_word_boundaries);
+
+        // Split the path into components, then words, so the "M" can be a better match
+        // for "folk music" than for "tom", and the best match for "music".
+        // And even more so if it's the last path component.
+        let path = PathBuf::from_str(&self.path).unwrap(); // safe because error is Infallible
+        let path_components = path.components();
+        let mut is_last_component = true;
+        for component in path_components.rev() {
+            let component = component.as_os_str().to_str().unwrap(); // safe because the path came from a string
+            let left_word_boundaries = left_word_boundaries(&component);
+            for keyword in keywords {
+                kw_score_sum += Self::compute_kw_score(&component, keyword, &left_word_boundaries, is_last_component);
+            }
+            is_last_component = false;
         }
 
         (kw_score_sum, adjusted_rank)
     }
 
-    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &Vec<usize>) -> u64 {
+    pub fn compute_kw_score(
+        path_component: &str,
+        keyword: &str,
+        left_word_boundaries: &Vec<usize>,
+        is_last_component: bool,
+    ) -> u64 {
         let keyword_lower = &keyword.to_lowercase();
-        let path_lower = self.path.to_lowercase();
+        let path_lower = path_component.to_lowercase();
 
         // more than one boundary can match
         let mut best_boundary_score = 0;
@@ -126,27 +147,32 @@ impl Dir<'_> {
             // TODO: think carefully about these rules. Should the case of the match
             // be allowed to influence the score? What if it's all lowercase, so
             // a smart case match is impossible?
-            let path = &self.path[*idx..];
-            let path_lower = &path_lower[*idx..];
-            if path.starts_with(keyword) {
+            let word = &path_component[*idx..];
+            let word_lower = &path_lower[*idx..];
+            if word.starts_with(keyword) {
                 // exact match
 
                 // TODO: think about checking the right word boundary, and give extra points if it matches.
                 //       Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
                 //       match src_3. But "src" will match src.
                 best_boundary_score = best_boundary_score.max(100);
-            } else if path_lower.starts_with(keyword) {
+            } else if word_lower.starts_with(keyword) {
                 // smart case match
                 best_boundary_score = best_boundary_score.max(90);
-            } else if path_lower.starts_with(keyword_lower) {
+            } else if word_lower.starts_with(keyword_lower) {
                 // wrong case but it's a match otherwise
                 best_boundary_score = best_boundary_score.max(20);
+            } else {
+                // No score. We don't need to give any score for a keyword that matches but not on a word boundary--
+                // All paths being checked should at least match in that way.
+                // But note that though the path will match the keyword, this path component may not match.
             }
+        }
 
-            // We don't need to give any score for a keyword that matches but not on a word boundary--
-            // All paths being checked should at least match in that way.
+        if best_boundary_score > 0 && is_last_component {
+            // matches in the last path component should be considered a little better
+            best_boundary_score += 5;
         }
-        debug_assert!(path_lower.contains(keyword_lower));
 
         best_boundary_score
     }

From 8d59bd5530f99babac017dd9368d5e7662875305 Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Sat, 27 Nov 2021 11:09:35 +0800
Subject: [PATCH 6/7] Consider leftmost matching within a component to be best

For example, when searching for 'd', ~/documents now ranks above ~/my-documents.
---
 src/db/dir.rs | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/db/dir.rs b/src/db/dir.rs
index 29bca175..0fd6a1d2 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -150,12 +150,14 @@ impl Dir<'_> {
             let word = &path_component[*idx..];
             let word_lower = &path_lower[*idx..];
             if word.starts_with(keyword) {
-                // exact match
+                // exact match, but even better if it's at the leftmost position in the component,
+                // like "D" matching $HOME/Documents
+                let score = if *idx == 0 { 105 } else { 100 };
 
                 // TODO: think about checking the right word boundary, and give extra points if it matches.
                 //       Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
                 //       match src_3. But "src" will match src.
-                best_boundary_score = best_boundary_score.max(100);
+                best_boundary_score = best_boundary_score.max(score);
             } else if word_lower.starts_with(keyword) {
                 // smart case match
                 best_boundary_score = best_boundary_score.max(90);

From a260d922ed92090f64b9849e4e282ab2ccd11ffd Mon Sep 17 00:00:00 2001
From: Dan Zwell <devel@zwell.net>
Date: Sat, 27 Nov 2021 11:32:33 +0800
Subject: [PATCH 7/7] Implement smart case matching

An option to turn off smart case could be added in the future, but that is not
a priority, for the reasons discussed in https://github.com/ajeetdsouza/zoxide/issues/224
---
 src/db/dir.rs | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/src/db/dir.rs b/src/db/dir.rs
index 0fd6a1d2..8ff819d0 100644
--- a/src/db/dir.rs
+++ b/src/db/dir.rs
@@ -114,7 +114,9 @@ impl Dir<'_> {
 
         let mut kw_score_sum = 0;
 
-        // Split the path into components, then words, so the "M" can be a better match
+        let smart_case = keywords.iter().all(|kw| &kw.to_lowercase() == kw);
+
+        // Split the path into components, then words, so the "m" can be a better match
         // for "folk music" than for "tom", and the best match for "music".
         // And even more so if it's the last path component.
         let path = PathBuf::from_str(&self.path).unwrap(); // safe because error is Infallible
@@ -122,9 +124,12 @@ impl Dir<'_> {
         let mut is_last_component = true;
         for component in path_components.rev() {
             let component = component.as_os_str().to_str().unwrap(); // safe because the path came from a string
+            let component = if smart_case { component.to_lowercase() } else { component.to_owned() };
+
             let left_word_boundaries = left_word_boundaries(&component);
             for keyword in keywords {
-                kw_score_sum += Self::compute_kw_score(&component, keyword, &left_word_boundaries, is_last_component);
+                kw_score_sum +=
+                    Self::compute_kw_score(&component, keyword, &left_word_boundaries, smart_case, is_last_component);
             }
             is_last_component = false;
         }
@@ -136,6 +141,7 @@ impl Dir<'_> {
         path_component: &str,
         keyword: &str,
         left_word_boundaries: &Vec<usize>,
+        smart_case: bool,
         is_last_component: bool,
     ) -> u64 {
         let keyword_lower = &keyword.to_lowercase();
@@ -158,9 +164,10 @@ impl Dir<'_> {
                 //       Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
                 //       match src_3. But "src" will match src.
                 best_boundary_score = best_boundary_score.max(score);
-            } else if word_lower.starts_with(keyword) {
-                // smart case match
-                best_boundary_score = best_boundary_score.max(90);
+            } else if !smart_case && word_lower.starts_with(keyword) {
+                // smart case is off (a keyword has case), but this keyword alone would be a smart case match
+                // for the component.
+                best_boundary_score = best_boundary_score.max(25);
             } else if word_lower.starts_with(keyword_lower) {
                 // wrong case but it's a match otherwise
                 best_boundary_score = best_boundary_score.max(20);