From b43f0877f83e832d23fa5c4a1292becc5b73665a Mon Sep 17 00:00:00 2001 From: Liam Bigelow <40188355+bglw@users.noreply.github.com> Date: Wed, 6 Sep 2023 12:59:50 +1200 Subject: [PATCH] Index emoji in and around words --- Cargo.lock | 33 ++++++++++++- pagefind/Cargo.toml | 2 + pagefind/features/characters.feature | 28 +++++++++++ pagefind/src/fossick/mod.rs | 12 +++-- pagefind/src/fossick/splitting.rs | 72 ++++++++++++++++++++++++---- 5 files changed, 134 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 1024689c..d00ba2b1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -771,6 +771,15 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" +[[package]] +name = "emojis" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979" +dependencies = [ + "phf 0.11.2", +] + [[package]] name = "encode_unicode" version = "0.3.6" @@ -1812,6 +1821,7 @@ dependencies = [ "clap 4.1.11", "console", "convert_case 0.6.0", + "emojis", "flate2", "futures", "hashbrown 0.13.1", @@ -1832,6 +1842,7 @@ dependencies = [ "sha-1", "tokio", "twelf", + "unicode-segmentation", "wax", ] @@ -1917,6 +1928,15 @@ dependencies = [ "phf_shared 0.10.0", ] +[[package]] +name = "phf" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc" +dependencies = [ + "phf_shared 0.11.2", +] + [[package]] name = "phf_codegen" version = "0.8.0" @@ -1989,6 +2009,15 @@ dependencies = [ "siphasher", ] +[[package]] +name = "phf_shared" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b" +dependencies = [ + "siphasher", +] + [[package]] name = "pin-project-lite" version = 
"0.2.9" @@ -2682,9 +2711,9 @@ dependencies = [ [[package]] name = "unicode-segmentation" -version = "1.10.0" +version = "1.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a" +checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36" [[package]] name = "unicode-width" diff --git a/pagefind/Cargo.toml b/pagefind/Cargo.toml index 205267ea..5c7d6c56 100644 --- a/pagefind/Cargo.toml +++ b/pagefind/Cargo.toml @@ -50,6 +50,8 @@ pagefind_stem = { version = "0.2.0", features = [ ] } convert_case = "0.6.0" charabia = { version = "0.7.0", optional = true } +unicode-segmentation = "1.10.1" +emojis = "0.6.1" hashbrown = { version = "0.13.1", features = ["serde"] } regex = "1.1" minicbor = { version = "0.19.1", features = ["alloc", "derive"] } diff --git a/pagefind/features/characters.feature b/pagefind/features/characters.feature index 8d48d861..bac06bd1 100644 --- a/pagefind/features/characters.feature +++ b/pagefind/features/characters.feature @@ -31,6 +31,34 @@ Feature: Character Tests Then There should be no logs Then The selector "[data-result]" should contain "/apiary/" + Scenario: Pagefind matches emoji + Given I have a "public/fam-seperate/index.html" file with the body: + """ +

Fam 👨‍👩‍👧‍👦

+ """ + Given I have a "public/fam-middled/index.html" file with the body: + """ +

F👨‍👩‍👧‍👦am

+ """ + When I run my program + Then I should see "Running Pagefind" in stdout + Then I should see the file "public/pagefind/pagefind.js" + When I serve the "public" directory + When I load "/" + When I evaluate: + """ + async function() { + let pagefind = await import("/pagefind/pagefind.js"); + + let search = await pagefind.search("👨‍👩‍👧‍👦"); + + let pages = await Promise.all(search.results.map(r => r.data())); + document.querySelector('[data-result]').innerText = pages.map(p => p.url).sort().join(", "); + } + """ + Then There should be no logs + Then The selector "[data-result]" should contain "/fam-middled/, /fam-seperate/" + Scenario: Pagefind doesn't match HTML entities as their text Given I have a "public/apiary/index.html" file with the body: """ diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs index 7c9fd60a..9d35ea00 100644 --- a/pagefind/src/fossick/mod.rs +++ b/pagefind/src/fossick/mod.rs @@ -332,13 +332,19 @@ impl Fossicker { // For words that may be CompoundWords, also index them as their constituent parts if normalized_word != word { - let parts = get_discrete_words(word); + let (word_parts, extras) = get_discrete_words(word); // Only proceed if the word was broken into multiple parts - if parts.contains(|c: char| c.is_whitespace()) { - for part_word in parts.split_whitespace() { + if word_parts.contains(|c: char| c.is_whitespace()) { + for part_word in word_parts.split_whitespace() { store_word(part_word, word_index, *word_weight); } } + // Additionally store any special extra characters we are given + if let Some(extras) = extras { + for extra in extras { + store_word(&extra, word_index, *word_weight); + } + } } } if content.ends_with(' ') { diff --git a/pagefind/src/fossick/splitting.rs b/pagefind/src/fossick/splitting.rs index 71bbfc8b..a3275772 100644 --- a/pagefind/src/fossick/splitting.rs +++ b/pagefind/src/fossick/splitting.rs @@ -1,9 +1,38 @@ use convert_case::{Case, Casing}; +use emojis; +use
lazy_static::lazy_static; +use regex::Regex; +use unicode_segmentation::UnicodeSegmentation; -pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String { - s.as_ref() +lazy_static! { + static ref EMOJI: Regex = Regex::new("\\p{Emoji}").unwrap(); +} + +pub fn get_discrete_words<S: AsRef<str>>(s: S) -> (String, Option<Vec<String>>) { + let mut extras = None; + + let words = s + .as_ref() .replace(|c| c == '.' || c == ',' || c == '/' || c == ':', " ") - .to_case(Case::Lower) + .to_case(Case::Lower); + + if EMOJI.is_match(s.as_ref()) { + extras = Some( + s.as_ref() + .graphemes(true) + .into_iter() + .filter_map(|x| { + if emojis::get(x).is_some() { + Some(x.to_string()) + } else { + None + } + }) + .collect::<Vec<String>>(), + ); + } + + (words, extras) } #[cfg(test)] @@ -13,30 +42,57 @@ mod tests { #[test] fn hyphenated_words() { let input = "these-words-are-hyphenated"; - assert_eq!(get_discrete_words(input), "these words are hyphenated"); + assert_eq!( + get_discrete_words(input), + ("these words are hyphenated".into(), None) + ); } #[test] fn underscored_words() { let input = "__array_structures"; - assert_eq!(get_discrete_words(input), "array structures"); + assert_eq!(get_discrete_words(input), ("array structures".into(), None)); } #[test] fn camel_words() { let input = "WKWebVIEWComponent"; - assert_eq!(get_discrete_words(input), "wk web view component"); + assert_eq!( + get_discrete_words(input), + ("wk web view component".into(), None) + ); } #[test] fn dotted_words() { let input = "page.Find"; - assert_eq!(get_discrete_words(input), "page find"); + assert_eq!(get_discrete_words(input), ("page find".into(), None)); } #[test] fn misc_punctuation() { let input = "cloud/cannon,page.find"; - assert_eq!(get_discrete_words(input), "cloud cannon page find"); + assert_eq!( + get_discrete_words(input), + ("cloud cannon page find".into(), None) + ); + } + + #[test] + fn emoji() { + let input = "cloud🌦️cannon"; + assert_eq!( + get_discrete_words(input), + ("cloud🌦️cannon".into(), Some(vec!["🌦️".into()])) + ); + + let
input = "👋👨‍👩‍👧‍👦🌾"; + assert_eq!( + get_discrete_words(input), + ( + "👋👨‍👩‍👧‍👦🌾".into(), + Some(vec!["👋".into(), "👨‍👩‍👧‍👦".into(), "🌾".into()]) + ) + ); } }