Skip to content

Commit

Permalink
Implement smart indexing of compound words
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Aug 30, 2023
1 parent 06de164 commit 980ec87
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 19 deletions.
12 changes: 11 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pagefind/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pagefind_stem = { version = "0.2.0", features = [
"turkish",
"yiddish",
] }
convert_case = "0.6.0"
charabia = { version = "0.7.0", optional = true }
hashbrown = { version = "0.13.1", features = ["serde"] }
regex = "1.1"
Expand Down
153 changes: 153 additions & 0 deletions pagefind/features/characters.feature
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,156 @@ Feature: Character Tests
"""
Then There should be no logs
Then The selector "[data-result]" should contain '[0]'

# Verifies that compound tokens are additionally indexed as their constituent
# words: one fixture page per punctuation style (hyphen, period, camelCase,
# underscore, slash, comma), then a search for each constituent part must
# resolve to the owning page.
Scenario: Punctuated compound words are indexed per word
Given I have a "public/hyphen/index.html" file with the body:
"""
<p>beet-root</p>
"""
Given I have a "public/period/index.html" file with the body:
"""
<p>image.png</p>
"""
Given I have a "public/camel/index.html" file with the body:
"""
<p>WKWebVIEWComponent</p>
"""
Given I have a "public/underscore/index.html" file with the body:
"""
<p>Word_Boundaries</p>
"""
Given I have a "public/slash/index.html" file with the body:
"""
<p>sandwich/salad</p>
"""
Given I have a "public/comma/index.html" file with the body:
"""
<p>Cloud,Cannon</p>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
# "beet-root" must be findable as "beet" and as "root".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("beet")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("root")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/hyphen/, /hyphen/'
# "image.png" must be findable as "image" and as "png".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("image")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("png")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/period/, /period/'
# The camelCase compound must match the full word and its inner words;
# the search term's differing case exercises normalization too.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("WkWebVIEWComponent")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("web")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("component")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/camel/, /camel/, /camel/'
# "Word_Boundaries" must match "word" and the stemmed "bound".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("word")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("bound")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/underscore/, /underscore/'
# "sandwich/salad" must match each side of the slash.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("sandwich")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("salad")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/slash/, /slash/'
# "Cloud,Cannon" must match the joined compound and each half.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("CloudCannon")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("cloud")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("cannon")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/comma/, /comma/, /comma/'

# Negative case: ordinary punctuation (apostrophe, bracket, @, #, ~, +,
# quotes) must NOT cause a token to be split and indexed per word, so
# searching for the fragment after the punctuation finds nothing.
# (Fixes typo in the scenario title: "punctionation" -> "punctuation".)
Scenario: Standard punctuation isn't indexed per word
Given I have a "public/standard/index.html" file with the body:
"""
<p>not'anotherword</p>
<p>not(anotherword</p>
<p>not@anotherword</p>
<p>not#anotherword</p>
<p>not~anotherword</p>
<p>not+anotherword</p>
<p>not"anotherword"</p>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let search = await pagefind.search("anotherword");
let pages = await Promise.all(search.results.map(r => r.data()));
document.querySelector('[data-result]').innerText = `${pages.length} result(s)`;
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '0 result(s)'
52 changes: 35 additions & 17 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::SearchOptions;
use parser::DomParser;

use self::parser::DomParserResult;
use self::splitting::get_discrete_words;

lazy_static! {
static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
Expand All @@ -28,6 +29,7 @@ lazy_static! {
}

pub mod parser;
mod splitting;

#[derive(Debug, Clone, PartialEq)]
pub struct FossickedWord {
Expand Down Expand Up @@ -202,6 +204,24 @@ impl Fossicker {

let mut content = String::with_capacity(data.digest.len());

let mut store_word = |full_word: &str, word_index: usize, word_weight: u8| {
let word = if let Some(stemmer) = &stemmer {
stemmer.stem(&full_word).into_owned()
} else {
full_word.to_string()
};

let entry = FossickedWord {
position: word_index.try_into().unwrap(),
weight: word_weight,
};
if let Some(repeat) = map.get_mut(&word) {
repeat.push(entry);
} else {
map.insert(word, vec![entry]);
}
};

// TODO: Consider reading newlines and jump the word_index up some amount,
// so that separate bodies of text don't return exact string
// matches across the boundaries. Or otherwise use some marker byte for the boundary.
Expand Down Expand Up @@ -295,30 +315,28 @@ impl Fossicker {

content.push_str(&word.replace('\u{200B}', ""));
content.push(' ');

let mut normalized_word = SPECIAL_CHARS
.replace_all(word, "")
.into_owned()
.to_lowercase();

#[cfg(feature = "extended")]
if should_segment {
content.push('\u{200B}');
}

let normalized_word = SPECIAL_CHARS
.replace_all(word, "")
.into_owned()
.to_lowercase();

if !normalized_word.is_empty() {
if let Some(stemmer) = &stemmer {
normalized_word = stemmer.stem(&normalized_word).into_owned();
}
store_word(&normalized_word, word_index, *word_weight);
}

let entry = FossickedWord {
position: word_index.try_into().unwrap(),
weight: *word_weight,
};
if let Some(repeat) = map.get_mut(&normalized_word) {
repeat.push(entry);
} else {
map.insert(normalized_word, vec![entry]);
// For words that may be CompoundWords, also index them as their constituent parts
if normalized_word != word {
let parts = get_discrete_words(word);
// Only proceed if the word was broken into multiple parts
if parts.contains(|c: char| c.is_whitespace()) {
for part_word in parts.split_whitespace() {
store_word(part_word, word_index, *word_weight);
}
}
}
}
Expand Down
42 changes: 42 additions & 0 deletions pagefind/src/fossick/splitting.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use convert_case::{Case, Casing};

/// Breaks a compound token into its discrete lowercase words, joined by
/// single spaces (e.g. `"cloud/cannon,page.find"` -> `"cloud cannon page find"`).
///
/// Periods, commas, slashes, and colons are first rewritten as spaces;
/// `convert_case` then lowercases the result, splitting on its default
/// word boundaries (which, per the unit tests below, cover hyphens,
/// underscores, and camelCase/acronym transitions).
pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String {
    let despecialized: String = s
        .as_ref()
        .chars()
        .map(|c| match c {
            '.' | ',' | '/' | ':' => ' ',
            other => other,
        })
        .collect();
    despecialized.to_case(Case::Lower)
}

// Unit tests pinning the splitting behavior `get_discrete_words` relies on
// from convert_case's default boundary detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Hyphens act as word separators.
    #[test]
    fn hyphenated_words() {
        let input = "these-words-are-hyphenated";
        assert_eq!(get_discrete_words(input), "these words are hyphenated");
    }

    // Leading/repeated underscores are dropped rather than producing
    // empty words.
    #[test]
    fn underscored_words() {
        let input = "__array_structures";
        assert_eq!(get_discrete_words(input), "array structures");
    }

    // camelCase transitions split, and acronym runs ("WK", "VIEW") stay
    // together as single lowercase words.
    #[test]
    fn camel_words() {
        let input = "WKWebVIEWComponent";
        assert_eq!(get_discrete_words(input), "wk web view component");
    }

    // Periods are replaced with spaces before case conversion.
    #[test]
    fn dotted_words() {
        let input = "page.Find";
        assert_eq!(get_discrete_words(input), "page find");
    }

    // Slash, comma, and period all act as separators in one token.
    #[test]
    fn misc_punctuation() {
        let input = "cloud/cannon,page.find";
        assert_eq!(get_discrete_words(input), "cloud cannon page find");
    }
}
2 changes: 1 addition & 1 deletion pagefind/src/logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl Logger {

Self {
log_level,
out: if (use_terminal) {
out: if use_terminal {
Some(Term::stdout())
} else {
None
Expand Down

0 comments on commit 980ec87

Please sign in to comment.