From b43f0877f83e832d23fa5c4a1292becc5b73665a Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Wed, 6 Sep 2023 12:59:50 +1200
Subject: [PATCH] Index emoji in and around words

---
 Cargo.lock                           | 33 ++++++++++++-
 pagefind/Cargo.toml                  |  2 +
 pagefind/features/characters.feature | 28 +++++++++++
 pagefind/src/fossick/mod.rs          | 12 +++--
 pagefind/src/fossick/splitting.rs    | 72 ++++++++++++++++++++++++----
 5 files changed, 134 insertions(+), 13 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 1024689c..d00ba2b1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -771,6 +771,15 @@ version = "1.8.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
 
+[[package]]
+name = "emojis"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
+dependencies = [
+ "phf 0.11.2",
+]
+
 [[package]]
 name = "encode_unicode"
 version = "0.3.6"
@@ -1812,6 +1821,7 @@ dependencies = [
  "clap 4.1.11",
  "console",
  "convert_case 0.6.0",
+ "emojis",
  "flate2",
  "futures",
  "hashbrown 0.13.1",
@@ -1832,6 +1842,7 @@ dependencies = [
  "sha-1",
  "tokio",
  "twelf",
+ "unicode-segmentation",
  "wax",
 ]
 
@@ -1917,6 +1928,15 @@ dependencies = [
  "phf_shared 0.10.0",
 ]
 
+[[package]]
+name = "phf"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
+dependencies = [
+ "phf_shared 0.11.2",
+]
+
 [[package]]
 name = "phf_codegen"
 version = "0.8.0"
@@ -1989,6 +2009,15 @@ dependencies = [
  "siphasher",
 ]
 
+[[package]]
+name = "phf_shared"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
+dependencies = [
+ "siphasher",
+]
+
 [[package]]
 name = "pin-project-lite"
 version = "0.2.9"
@@ -2682,9 +2711,9 @@ dependencies = [
 
 [[package]]
 name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
 
 [[package]]
 name = "unicode-width"
diff --git a/pagefind/Cargo.toml b/pagefind/Cargo.toml
index 205267ea..5c7d6c56 100644
--- a/pagefind/Cargo.toml
+++ b/pagefind/Cargo.toml
@@ -50,6 +50,8 @@ pagefind_stem = { version = "0.2.0", features = [
 ] }
 convert_case = "0.6.0"
 charabia = { version = "0.7.0", optional = true }
+unicode-segmentation = "1.10.1"
+emojis = "0.6.1"
 hashbrown = { version = "0.13.1", features = ["serde"] }
 regex = "1.1"
 minicbor = { version = "0.19.1", features = ["alloc", "derive"] }
diff --git a/pagefind/features/characters.feature b/pagefind/features/characters.feature
index 8d48d861..bac06bd1 100644
--- a/pagefind/features/characters.feature
+++ b/pagefind/features/characters.feature
@@ -31,6 +31,34 @@ Feature: Character Tests
         Then There should be no logs
         Then The selector "[data-result]" should contain "/apiary/"
 
+    Scenario: Pagefind matches emoji
+        Given I have a "public/fam-seperate/index.html" file with the body:
+            """
+            <h1>Fam 👨‍👩‍👧‍👦</h1>
+            """
+        Given I have a "public/fam-middled/index.html" file with the body:
+            """
+            <h1>F👨‍👩‍👧‍👦am</h1>
+            """
+        When I run my program
+        Then I should see "Running Pagefind" in stdout
+        Then I should see the file "public/pagefind/pagefind.js"
+        When I serve the "public" directory
+        When I load "/"
+        When I evaluate:
+            """
+            async function() {
+                let pagefind = await import("/pagefind/pagefind.js");
+
+                let search = await pagefind.search("👨‍👩‍👧‍👦");
+
+                let pages = await Promise.all(search.results.map(r => r.data()));
+                document.querySelector('[data-result]').innerText = pages.map(p => p.url).sort().join(", ");
+            }
+            """
+        Then There should be no logs
+        Then The selector "[data-result]" should contain "/fam-middled/, /fam-seperate/"
+
     Scenario: Pagefind doesn't match HTML entities as their text
         Given I have a "public/apiary/index.html" file with the body:
             """
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 7c9fd60a..9d35ea00 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -332,13 +332,19 @@ impl Fossicker {
 
             // For words that may be CompoundWords, also index them as their constituent parts
             if normalized_word != word {
-                let parts = get_discrete_words(word);
+                let (word_parts, extras) = get_discrete_words(word);
                 // Only proceed if the word was broken into multiple parts
-                if parts.contains(|c: char| c.is_whitespace()) {
-                    for part_word in parts.split_whitespace() {
+                if word_parts.contains(|c: char| c.is_whitespace()) {
+                    for part_word in word_parts.split_whitespace() {
                         store_word(part_word, word_index, *word_weight);
                     }
                 }
+                // Also index any emoji found within the word as standalone terms
+                if let Some(extras) = extras {
+                    for extra in extras {
+                        store_word(&extra, word_index, *word_weight);
+                    }
+                }
             }
         }
         if content.ends_with(' ') {
diff --git a/pagefind/src/fossick/splitting.rs b/pagefind/src/fossick/splitting.rs
index 71bbfc8b..a3275772 100644
--- a/pagefind/src/fossick/splitting.rs
+++ b/pagefind/src/fossick/splitting.rs
@@ -1,9 +1,38 @@
 use convert_case::{Case, Casing};
+use emojis;
+use lazy_static::lazy_static;
+use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
 
-pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String {
-    s.as_ref()
+lazy_static! {
+    // Cheap pre-filter only: `\p{Emoji}` also matches ASCII digits, `#` and `*`.
+    static ref EMOJI: Regex = Regex::new("\\p{Emoji}").unwrap();
+}
+
+/// Splits `s` into lowercased, space-separated words, plus any emoji found in it.
+/// The second element is `None` unless at least one grapheme is a known emoji.
+pub fn get_discrete_words<S: AsRef<str>>(s: S) -> (String, Option<Vec<String>>) {
+    let words = s
+        .as_ref()
         .replace(|c| c == '.' || c == ',' || c == '/' || c == ':', " ")
-        .to_case(Case::Lower)
+        .to_case(Case::Lower);
+
+    // Skip grapheme segmentation entirely for the common emoji-free word.
+    if !EMOJI.is_match(s.as_ref()) {
+        return (words, None);
+    }
+
+    // Confirm each grapheme cluster against the `emojis` table, so multi-codepoint
+    // emoji such as ZWJ sequences are looked up (and returned) as a whole.
+    let found: Vec<String> = s
+        .as_ref()
+        .graphemes(true)
+        .filter(|g| emojis::get(g).is_some())
+        .map(str::to_string)
+        .collect();
+
+    // An empty list means the pre-filter hit a non-emoji such as a digit.
+    (words, if found.is_empty() { None } else { Some(found) })
 }
 
 #[cfg(test)]
@@ -13,30 +42,57 @@ mod tests {
     #[test]
     fn hyphenated_words() {
         let input = "these-words-are-hyphenated";
-        assert_eq!(get_discrete_words(input), "these words are hyphenated");
+        assert_eq!(
+            get_discrete_words(input),
+            ("these words are hyphenated".into(), None)
+        );
     }
 
     #[test]
     fn underscored_words() {
         let input = "__array_structures";
-        assert_eq!(get_discrete_words(input), "array structures");
+        assert_eq!(get_discrete_words(input), ("array structures".into(), None));
     }
 
     #[test]
     fn camel_words() {
         let input = "WKWebVIEWComponent";
-        assert_eq!(get_discrete_words(input), "wk web view component");
+        assert_eq!(
+            get_discrete_words(input),
+            ("wk web view component".into(), None)
+        );
     }
 
     #[test]
     fn dotted_words() {
         let input = "page.Find";
-        assert_eq!(get_discrete_words(input), "page find");
+        assert_eq!(get_discrete_words(input), ("page find".into(), None));
     }
 
     #[test]
     fn misc_punctuation() {
         let input = "cloud/cannon,page.find";
-        assert_eq!(get_discrete_words(input), "cloud cannon page find");
+        assert_eq!(
+            get_discrete_words(input),
+            ("cloud cannon page find".into(), None)
+        );
+    }
+
+    #[test]
+    fn emoji() {
+        let input = "cloud🌦️cannon";
+        assert_eq!(
+            get_discrete_words(input),
+            ("cloud🌦️cannon".into(), Some(vec!["🌦️".into()]))
+        );
+
+        let input = "👋👨‍👩‍👧‍👦🌾";
+        assert_eq!(
+            get_discrete_words(input),
+            (
+                "👋👨‍👩‍👧‍👦🌾".into(),
+                Some(vec!["👋".into(), "👨‍👩‍👧‍👦".into(), "🌾".into()])
+            )
+        );
     }
 }