Use word boundary detection and give scores based on keyword matching
I'm using a helper library to implement the Unicode word-segmentation
algorithm, but I'm also detecting case changes within a word (from lower to
upper case, or from no case to some case), so the "o" in "Documents" doesn't
start a new word.

Words themselves are not searched; rather, the string is searched starting
at each word boundary, so multi-word sequences match correctly.

This is a basic solution to #260.
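
As a minimal illustration (assuming nothing beyond the unicode-segmentation
crate this commit adds), the segmentation step reports each word together
with its starting byte offset, and those offsets are what seed
left_word_boundaries:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        // For "hi there" this prints offsets 0 and 3 along with the words
        // themselves; the search then starts at each such boundary.
        for (idx, word) in "hi there".unicode_word_indices() {
            println!("{}: {}", idx, word);
        }
    }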

Some things to consider:
- We don't have options to control case handling. If smart-case is
  disabled, the weights in compute_kw_score need to change.
- Right now the keyword score totally overpowers the frequency score; the
  frequency score is only a tie-breaker. The two could be normalized and
  weighted so that a much better frequency score wins despite a slightly
  worse keyword score (see the first sketch after this list).
- Should we detect word endings for exact matches? I'm not sure it would
  give a good user experience. If I frequently access "src9" but not "src",
  I don't want "src" to win just because it more exactly matches what I
  typed; it's hard to refrain from typing a whole word. It also produces an
  interesting wrong result: "c" becomes a perfect match for
  "/mnt/c/anything".
- The above issue can be solved if we consider digits to start a new word
  (see the second sketch after this list).
- I'm testing these changes with `cargo r -- query --list --score b`.
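
Here is the first sketch, for the normalization idea above. The blend
function and the 0.7/0.3 weights are assumptions, not part of this commit:

    // Hypothetical: normalize both components to [0, 1] and blend them, so a
    // much better frequency score can beat a slightly worse keyword score.
    fn combined_score(kw_score: u64, max_kw_score: u64, rank: f64, max_rank: f64) -> f64 {
        let kw_norm = kw_score as f64 / max_kw_score.max(1) as f64;
        let rank_norm = if max_rank > 0.0 { rank / max_rank } else { 0.0 };
        0.7 * kw_norm + 0.3 * rank_norm
    }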
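
And the second sketch, for digits starting a new word. Case::Digit and the
extra boundary clause are hypothetical and not in this commit:

    // Hypothetical extension of the Case enum: give digits their own class,
    // so "src9" splits as "src" + "9" and typing "src" can match the whole
    // word "src" exactly.
    #[derive(PartialEq, Clone, Copy, PartialOrd)]
    enum Case {
        None,
        Digit,
        LowerCase,
        UpperCase,
    }

    // A word starts where the case rank increases (the rule this commit
    // uses), or where a digit run begins after cased letters.
    fn starts_word(prev: Case, curr: Case) -> bool {
        curr > prev || (curr == Case::Digit && prev > Case::Digit)
    }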
lefth committed Sep 17, 2021 · 1 parent ec3ad20 · commit d6f5db0
Showing 5 changed files with 138 additions and 14 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

1 change: 1 addition & 0 deletions Cargo.toml
@@ -20,6 +20,7 @@ glob = "0.3.0"
 ordered-float = "2.0.0"
 serde = { version = "1.0.116", features = ["derive"] }
 tempfile = "3.1.0"
+unicode-segmentation = "1.8.0"

 [target.'cfg(windows)'.dependencies]
 rand = { version = "0.8.4", features = [
6 changes: 3 additions & 3 deletions src/app/query.rs
@@ -34,7 +34,7 @@ impl Query {
         if self.interactive {
             let mut fzf = Fzf::new(false)?;
             while let Some(dir) = stream.next() {
-                writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+                writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(&self.keywords))).pipe_exit("fzf")?;
             }

             let selection = fzf.wait_select()?;
@@ -49,7 +49,7 @@ impl Query {
             let handle = &mut stdout.lock();
             while let Some(dir) = stream.next() {
                 if self.score {
-                    writeln!(handle, "{}", dir.display_score(now))
+                    writeln!(handle, "{}", dir.display_score(now, Some(&self.keywords)))
                 } else {
                     writeln!(handle, "{}", dir.display())
                 }
@@ -59,7 +59,7 @@ impl Query {
         } else {
             let dir = stream.next().context("no match found")?;
             if self.score {
-                writeln!(io::stdout(), "{}", dir.display_score(now))
+                writeln!(io::stdout(), "{}", dir.display_score(now, Some(&self.keywords)))
             } else {
                 writeln!(io::stdout(), "{}", dir.display())
             }
2 changes: 1 addition & 1 deletion src/app/remove.rs
@@ -22,7 +22,7 @@ impl Run for Remove {

         let mut fzf = Fzf::new(true)?;
         while let Some(dir) = stream.next() {
-            writeln!(fzf.stdin(), "{}", dir.display_score(now)).pipe_exit("fzf")?;
+            writeln!(fzf.stdin(), "{}", dir.display_score(now, Some(keywords))).pipe_exit("fzf")?;
         }

         selection = fzf.wait_select()?;
142 changes: 132 additions & 10 deletions src/db/dir.rs
@@ -5,6 +5,7 @@ use std::ops::{Deref, DerefMut};
 use anyhow::{bail, Context, Result};
 use bincode::Options as _;
 use serde::{Deserialize, Serialize};
+use unicode_segmentation::UnicodeSegmentation;

 #[derive(Debug, Deserialize, Serialize, Default)]
 pub struct DirList<'a>(#[serde(borrow)] pub Vec<Dir<'a>>);
@@ -88,7 +89,7 @@ pub struct Dir<'a> {
 }

 impl Dir<'_> {
-    pub fn score(&self, now: Epoch, _keywords: &Vec<String>) -> Score {
+    pub fn score(&self, now: Epoch, keywords: &Vec<String>) -> Score {
         const HOUR: Epoch = 60 * 60;
         const DAY: Epoch = 24 * HOUR;
         const WEEK: Epoch = 7 * DAY;
@@ -105,22 +106,111 @@ impl Dir<'_> {
             self.rank * 0.25
         };

-        // TODO: incorporate keywords into the scoring logic, so match quality is more significant
-        // than the access date. See issue #260.
-        let kw_score = 1;
+        let left_word_boundaries = left_word_boundaries(&self.path);

-        (kw_score, adjusted_rank)
+        let mut kw_score_sum = 0;
+        for keyword in keywords {
+            kw_score_sum += self.compute_kw_score(keyword, &left_word_boundaries);
+        }
+
+        (kw_score_sum, adjusted_rank)
     }

+    pub fn compute_kw_score(&self, keyword: &str, left_word_boundaries: &Vec<usize>) -> u64 {
+        let keyword_lower = &keyword.to_lowercase();
+        let path_lower = self.path.to_lowercase();
+
+        // More than one boundary can match.
+        let mut best_boundary_score = 0;
+        for idx in left_word_boundaries {
+            // TODO: think carefully about these rules. Should the case of the match
+            // be allowed to influence the score? What if it's all lowercase, so
+            // a smart case match is impossible?
+            let path = &self.path[*idx..];
+            let path_lower = &path_lower[*idx..];
+            if path.starts_with(keyword) {
+                // exact match
+
+                // TODO: think about checking the right word boundary, and give extra points if it matches.
+                // Imagine two directories, src_3 and src. If src_3 is more frequently used, "sr" will
+                // match src_3. But "src" will match src.
+                best_boundary_score = best_boundary_score.max(100);
+            } else if path_lower.starts_with(keyword) {
+                // smart case match
+                best_boundary_score = best_boundary_score.max(90);
+            } else if path_lower.starts_with(keyword_lower) {
+                // wrong case but it's a match otherwise
+                best_boundary_score = best_boundary_score.max(20);
+            }
+
+            // We don't need to give any score for a keyword that matches but not on
+            // a word boundary -- all paths being checked should at least match in that way.
+        }
+        debug_assert!(path_lower.contains(keyword_lower));
+
+        best_boundary_score
+    }
+
     pub fn display(&self) -> DirDisplay {
         DirDisplay { dir: self }
     }

-    pub fn display_score(&self, now: Epoch) -> DirDisplayScore {
-        DirDisplayScore { dir: self, now }
+    pub fn display_score(&self, now: Epoch, keywords: Option<&Vec<String>>) -> DirDisplayScore {
+        DirDisplayScore { dir: self, now, keywords: keywords.map(|vec| vec.iter().cloned().collect()) }
     }
 }

+/// Returns byte indices that correspond to the leftmost position of each word.
+/// For input "hi there", the result will contain 0 and 3.
+///
+/// The result may also contain extraneous indices.
+fn left_word_boundaries(text: &str) -> Vec<usize> {
+    let mut boundaries = Vec::new();
+
+    #[derive(PartialEq, Clone, Copy, PartialOrd)]
+    enum Case {
+        None,
+        LowerCase,
+        UpperCase,
+    }
+
+    // We won't need the words themselves because we want to do multi-word matching.
+    // We need the whole string for that.
+    for (word_idx, word) in text.unicode_word_indices() {
+        boundaries.push(word_idx);
+
+        // Also search for case changes, and non-text characters:
+        // MyDocuments
+        // my_documents
+        // TODO: should "clap3b4" count as 4 words or 1?
+        let mut prev_case = None;
+        for (grapheme_idx, grapheme) in word.grapheme_indices(true) {
+            let lower = grapheme.to_lowercase();
+            let upper = grapheme.to_uppercase();
+            let case = if lower == grapheme && upper == grapheme {
+                Case::None
+            } else if lower == grapheme {
+                Case::LowerCase
+            } else {
+                // Assume the other cases are upper case, because there might be
+                // more than one way to represent upper case.
+                Case::UpperCase
+            };
+
+            if let Some(prev_case) = &prev_case {
+                if case > *prev_case {
+                    // Consider this a word start if going from no case to any case,
+                    // or lower case to upper case.
+                    boundaries.push(word_idx + grapheme_idx);
+                }
+            }
+            let _ = prev_case.replace(case);
+        }
+    }
+
+    boundaries
+}
+
 pub struct DirDisplay<'a> {
     dir: &'a Dir<'a>,
 }
@@ -134,11 +224,15 @@ impl Display for DirDisplay<'_> {
 pub struct DirDisplayScore<'a> {
     dir: &'a Dir<'a>,
     now: Epoch,
+    keywords: Option<Vec<String>>,
 }

 impl Display for DirDisplayScore<'_> {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
-        let (kw_score, score) = self.dir.score(self.now, &vec![]);
+        let no_keywords = Vec::default();
+        let keywords = self.keywords.as_ref().unwrap_or(&no_keywords);
+
+        let (kw_score, score) = self.dir.score(self.now, keywords);
         let score = if score > 9999.0 {
             9999
         } else if score > 0.0 {
@@ -156,9 +250,9 @@ pub type Epoch = u64;

 #[cfg(test)]
 mod tests {
-    use std::borrow::Cow;
+    use std::{borrow::Cow, collections::HashSet};

-    use super::{Dir, DirList};
+    use super::{left_word_boundaries, Dir, DirList};

     #[test]
     fn zero_copy() {
@@ -171,4 +265,32 @@ mod tests {
             assert!(matches!(dir.path, Cow::Borrowed(_)))
         }
     }
+
+    #[test]
+    fn test_left_word_boundaries() {
+        assert!(left_word_boundaries("") == vec![]);
+        assert!(left_word_boundaries("Hi") == vec![0]);
+
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi there").into_iter().collect()));
+        assert!(vec![0, 3]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("hi_there").into_iter().collect()));
+
+        assert!(vec![0, 4] == left_word_boundaries("FürElise"));
+        assert!(vec![0, 1] == left_word_boundaries("uTorrent"));
+        assert!(vec![0, 2] == left_word_boundaries("µTorrent"));
+
+        assert!(vec![1, 6, 11]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries("/path/file.ext").into_iter().collect()));
+        assert!(vec![0, 3, 8, 13]
+            .into_iter()
+            .collect::<HashSet<_>>()
+            .is_subset(&left_word_boundaries(r"C:\path\file.ext").into_iter().collect()));
+    }
 }
