Skip to content

Commit

Permalink
Implement smart indexing of compound words
Browse files Browse the repository at this point in the history
  • Loading branch information
bglw committed Aug 30, 2023
1 parent 06de164 commit 980ec87
Show file tree
Hide file tree
Showing 6 changed files with 243 additions and 19 deletions.
12 changes: 11 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pagefind/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ pagefind_stem = { version = "0.2.0", features = [
"turkish",
"yiddish",
] }
convert_case = "0.6.0"
charabia = { version = "0.7.0", optional = true }
hashbrown = { version = "0.13.1", features = ["serde"] }
regex = "1.1"
Expand Down
153 changes: 153 additions & 0 deletions pagefind/features/characters.feature
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,156 @@ Feature: Character Tests
"""
Then There should be no logs
Then The selector "[data-result]" should contain '[0]'

# Verifies that compound tokens are additionally indexed as their constituent
# words: one fixture page per punctuation style (hyphen, period, camelCase,
# underscore, slash, comma), then a search for each constituent part must
# resolve to the owning page.
Scenario: Punctuated compound words are indexed per word
Given I have a "public/hyphen/index.html" file with the body:
"""
<p>beet-root</p>
"""
Given I have a "public/period/index.html" file with the body:
"""
<p>image.png</p>
"""
Given I have a "public/camel/index.html" file with the body:
"""
<p>WKWebVIEWComponent</p>
"""
Given I have a "public/underscore/index.html" file with the body:
"""
<p>Word_Boundaries</p>
"""
Given I have a "public/slash/index.html" file with the body:
"""
<p>sandwich/salad</p>
"""
Given I have a "public/comma/index.html" file with the body:
"""
<p>Cloud,Cannon</p>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
# "beet-root" must be findable as "beet" and as "root".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("beet")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("root")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/hyphen/, /hyphen/'
# "image.png" must be findable as "image" and as "png".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("image")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("png")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/period/, /period/'
# The camelCase compound must match the full word and its inner words;
# the search term's differing case exercises normalization too.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("WkWebVIEWComponent")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("web")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("component")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/camel/, /camel/, /camel/'
# "Word_Boundaries" must match "word" and the stemmed "bound".
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("word")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("bound")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/underscore/, /underscore/'
# "sandwich/salad" must match each side of the slash.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("sandwich")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("salad")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/slash/, /slash/'
# "Cloud,Cannon" must match the joined compound and each half.
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let pages = [
...(await Promise.all((await pagefind.search("CloudCannon")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("cloud")).results.map(r => r.data()))),
...(await Promise.all((await pagefind.search("cannon")).results.map(r => r.data()))),
];
document.querySelector('[data-result]').innerText = pages.map(p => p.url).join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '/comma/, /comma/, /comma/'

# Negative case: ordinary punctuation (apostrophe, bracket, @, #, ~, +,
# quotes) must NOT cause a token to be split and indexed per word, so
# searching for the fragment after the punctuation finds nothing.
# (Fixes typo in the scenario title: "punctionation" -> "punctuation".)
Scenario: Standard punctuation isn't indexed per word
Given I have a "public/standard/index.html" file with the body:
"""
<p>not'anotherword</p>
<p>not(anotherword</p>
<p>not@anotherword</p>
<p>not#anotherword</p>
<p>not~anotherword</p>
<p>not+anotherword</p>
<p>not"anotherword"</p>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/_pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/_pagefind/pagefind.js");
let search = await pagefind.search("anotherword");
let pages = await Promise.all(search.results.map(r => r.data()));
document.querySelector('[data-result]').innerText = `${pages.length} result(s)`;
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain '0 result(s)'
52 changes: 35 additions & 17 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ use crate::SearchOptions;
use parser::DomParser;

use self::parser::DomParserResult;
use self::splitting::get_discrete_words;

lazy_static! {
static ref NEWLINES: Regex = Regex::new("(\n|\r\n)+").unwrap();
Expand All @@ -28,6 +29,7 @@ lazy_static! {
}

pub mod parser;
mod splitting;

#[derive(Debug, Clone, PartialEq)]
pub struct FossickedWord {
Expand Down Expand Up @@ -202,6 +204,24 @@ impl Fossicker {

let mut content = String::with_capacity(data.digest.len());

let mut store_word = |full_word: &str, word_index: usize, word_weight: u8| {
let word = if let Some(stemmer) = &stemmer {
stemmer.stem(&full_word).into_owned()
} else {
full_word.to_string()
};

let entry = FossickedWord {
position: word_index.try_into().unwrap(),
weight: word_weight,
};
if let Some(repeat) = map.get_mut(&word) {
repeat.push(entry);
} else {
map.insert(word, vec![entry]);
}
};

// TODO: Consider reading newlines and jump the word_index up some amount,
// so that separate bodies of text don't return exact string
// matches across the boundaries. Or otherwise use some marker byte for the boundary.
Expand Down Expand Up @@ -295,30 +315,28 @@ impl Fossicker {

content.push_str(&word.replace('\u{200B}', ""));
content.push(' ');

let mut normalized_word = SPECIAL_CHARS
.replace_all(word, "")
.into_owned()
.to_lowercase();

#[cfg(feature = "extended")]
if should_segment {
content.push('\u{200B}');
}

let normalized_word = SPECIAL_CHARS
.replace_all(word, "")
.into_owned()
.to_lowercase();

if !normalized_word.is_empty() {
if let Some(stemmer) = &stemmer {
normalized_word = stemmer.stem(&normalized_word).into_owned();
}
store_word(&normalized_word, word_index, *word_weight);
}

let entry = FossickedWord {
position: word_index.try_into().unwrap(),
weight: *word_weight,
};
if let Some(repeat) = map.get_mut(&normalized_word) {
repeat.push(entry);
} else {
map.insert(normalized_word, vec![entry]);
// For words that may be CompoundWords, also index them as their constituent parts
if normalized_word != word {
let parts = get_discrete_words(word);
// Only proceed if the word was broken into multiple parts
if parts.contains(|c: char| c.is_whitespace()) {
for part_word in parts.split_whitespace() {
store_word(part_word, word_index, *word_weight);
}
}
}
}
Expand Down
42 changes: 42 additions & 0 deletions pagefind/src/fossick/splitting.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
use convert_case::{Case, Casing};

/// Breaks a compound token into its discrete lowercase words, joined by
/// single spaces (e.g. `"cloud/cannon,page.find"` -> `"cloud cannon page find"`).
///
/// Periods, commas, slashes, and colons are first rewritten as spaces;
/// `convert_case` then lowercases the result, splitting on its default
/// word boundaries (which, per the unit tests below, cover hyphens,
/// underscores, and camelCase/acronym transitions).
pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String {
    let despecialized: String = s
        .as_ref()
        .chars()
        .map(|c| match c {
            '.' | ',' | '/' | ':' => ' ',
            other => other,
        })
        .collect();
    despecialized.to_case(Case::Lower)
}

// Unit tests pinning the splitting behavior `get_discrete_words` relies on
// from convert_case's default boundary detection.
#[cfg(test)]
mod tests {
    use super::*;

    // Hyphens act as word separators.
    #[test]
    fn hyphenated_words() {
        let input = "these-words-are-hyphenated";
        assert_eq!(get_discrete_words(input), "these words are hyphenated");
    }

    // Leading/repeated underscores are dropped rather than producing
    // empty words.
    #[test]
    fn underscored_words() {
        let input = "__array_structures";
        assert_eq!(get_discrete_words(input), "array structures");
    }

    // camelCase transitions split, and acronym runs ("WK", "VIEW") stay
    // together as single lowercase words.
    #[test]
    fn camel_words() {
        let input = "WKWebVIEWComponent";
        assert_eq!(get_discrete_words(input), "wk web view component");
    }

    // Periods are replaced with spaces before case conversion.
    #[test]
    fn dotted_words() {
        let input = "page.Find";
        assert_eq!(get_discrete_words(input), "page find");
    }

    // Slash, comma, and period all act as separators in one token.
    #[test]
    fn misc_punctuation() {
        let input = "cloud/cannon,page.find";
        assert_eq!(get_discrete_words(input), "cloud cannon page find");
    }
}
2 changes: 1 addition & 1 deletion pagefind/src/logging.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ impl Logger {

Self {
log_level,
out: if (use_terminal) {
out: if use_terminal {
Some(Term::stdout())
} else {
None
Expand Down

0 comments on commit 980ec87

Please sign in to comment.