From b43f0877f83e832d23fa5c4a1292becc5b73665a Mon Sep 17 00:00:00 2001
From: Liam Bigelow <40188355+bglw@users.noreply.github.com>
Date: Wed, 6 Sep 2023 12:59:50 +1200
Subject: [PATCH] Index emoji in and around words
---
Cargo.lock | 33 ++++++++++++-
pagefind/Cargo.toml | 2 +
pagefind/features/characters.feature | 28 +++++++++++
pagefind/src/fossick/mod.rs | 12 +++--
pagefind/src/fossick/splitting.rs | 72 ++++++++++++++++++++++++----
5 files changed, 134 insertions(+), 13 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 1024689c..d00ba2b1 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -771,6 +771,15 @@ version = "1.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797"
+[[package]]
+name = "emojis"
+version = "0.6.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ee61eb945bff65ee7d19d157d39c67c33290ff0742907413fd5eefd29edc979"
+dependencies = [
+ "phf 0.11.2",
+]
+
[[package]]
name = "encode_unicode"
version = "0.3.6"
@@ -1812,6 +1821,7 @@ dependencies = [
"clap 4.1.11",
"console",
"convert_case 0.6.0",
+ "emojis",
"flate2",
"futures",
"hashbrown 0.13.1",
@@ -1832,6 +1842,7 @@ dependencies = [
"sha-1",
"tokio",
"twelf",
+ "unicode-segmentation",
"wax",
]
@@ -1917,6 +1928,15 @@ dependencies = [
"phf_shared 0.10.0",
]
+[[package]]
+name = "phf"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ade2d8b8f33c7333b51bcf0428d37e217e9f32192ae4772156f65063b8ce03dc"
+dependencies = [
+ "phf_shared 0.11.2",
+]
+
[[package]]
name = "phf_codegen"
version = "0.8.0"
@@ -1989,6 +2009,15 @@ dependencies = [
"siphasher",
]
+[[package]]
+name = "phf_shared"
+version = "0.11.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "90fcb95eef784c2ac79119d1dd819e162b5da872ce6f3c3abe1e8ca1c082f72b"
+dependencies = [
+ "siphasher",
+]
+
[[package]]
name = "pin-project-lite"
version = "0.2.9"
@@ -2682,9 +2711,9 @@ dependencies = [
[[package]]
name = "unicode-segmentation"
-version = "1.10.0"
+version = "1.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0fdbf052a0783de01e944a6ce7a8cb939e295b1e7be835a1112c3b9a7f047a5a"
+checksum = "1dd624098567895118886609431a7c3b8f516e41d30e0643f03d94592a147e36"
[[package]]
name = "unicode-width"
diff --git a/pagefind/Cargo.toml b/pagefind/Cargo.toml
index 205267ea..5c7d6c56 100644
--- a/pagefind/Cargo.toml
+++ b/pagefind/Cargo.toml
@@ -50,6 +50,8 @@ pagefind_stem = { version = "0.2.0", features = [
] }
convert_case = "0.6.0"
charabia = { version = "0.7.0", optional = true }
+unicode-segmentation = "1.10.1"
+emojis = "0.6.1"
hashbrown = { version = "0.13.1", features = ["serde"] }
regex = "1.1"
minicbor = { version = "0.19.1", features = ["alloc", "derive"] }
diff --git a/pagefind/features/characters.feature b/pagefind/features/characters.feature
index 8d48d861..bac06bd1 100644
--- a/pagefind/features/characters.feature
+++ b/pagefind/features/characters.feature
@@ -31,6 +31,34 @@ Feature: Character Tests
Then There should be no logs
Then The selector "[data-result]" should contain "/apiary/"
+ Scenario: Pagefind matches emoji
+ Given I have a "public/fam-seperate/index.html" file with the body:
+ """
+ <p>Fam 👨‍👩‍👧‍👦</p>
+ """
+ Given I have a "public/fam-middled/index.html" file with the body:
+ """
+ <p>F👨‍👩‍👧‍👦am</p>
+ """
+ When I run my program
+ Then I should see "Running Pagefind" in stdout
+ Then I should see the file "public/pagefind/pagefind.js"
+ When I serve the "public" directory
+ When I load "/"
+ When I evaluate:
+ """
+ async function() {
+ let pagefind = await import("/pagefind/pagefind.js");
+
+ let search = await pagefind.search("👨‍👩‍👧‍👦");
+
+ let pages = await Promise.all(search.results.map(r => r.data()));
+ document.querySelector('[data-result]').innerText = pages.map(p => p.url).sort().join(", ");
+ }
+ """
+ Then There should be no logs
+ Then The selector "[data-result]" should contain "/fam-middled/, /fam-seperate/"
+
Scenario: Pagefind doesn't match HTML entities as their text
Given I have a "public/apiary/index.html" file with the body:
"""
diff --git a/pagefind/src/fossick/mod.rs b/pagefind/src/fossick/mod.rs
index 7c9fd60a..9d35ea00 100644
--- a/pagefind/src/fossick/mod.rs
+++ b/pagefind/src/fossick/mod.rs
@@ -332,13 +332,19 @@ impl Fossicker {
// For words that may be CompoundWords, also index them as their constituent parts
if normalized_word != word {
- let parts = get_discrete_words(word);
+ let (word_parts, extras) = get_discrete_words(word);
// Only proceed if the word was broken into multiple parts
- if parts.contains(|c: char| c.is_whitespace()) {
- for part_word in parts.split_whitespace() {
+ if word_parts.contains(|c: char| c.is_whitespace()) {
+ for part_word in word_parts.split_whitespace() {
store_word(part_word, word_index, *word_weight);
}
}
+ // Additionally store any special extra characters we are given
+ if let Some(extras) = extras {
+ for extra in extras {
+ store_word(&extra, word_index, *word_weight);
+ }
+ }
}
}
if content.ends_with(' ') {
diff --git a/pagefind/src/fossick/splitting.rs b/pagefind/src/fossick/splitting.rs
index 71bbfc8b..a3275772 100644
--- a/pagefind/src/fossick/splitting.rs
+++ b/pagefind/src/fossick/splitting.rs
@@ -1,9 +1,38 @@
use convert_case::{Case, Casing};
+use emojis;
+use lazy_static::lazy_static;
+use regex::Regex;
+use unicode_segmentation::UnicodeSegmentation;
-pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String {
- s.as_ref()
+lazy_static! {
+ static ref EMOJI: Regex = Regex::new("\\p{Emoji}").unwrap();
+}
+
+pub fn get_discrete_words<S: AsRef<str>>(s: S) -> (String, Option<Vec<String>>) {
+ let mut extras = None;
+
+ let words = s
+ .as_ref()
.replace(|c| c == '.' || c == ',' || c == '/' || c == ':', " ")
- .to_case(Case::Lower)
+ .to_case(Case::Lower);
+
+ if EMOJI.is_match(s.as_ref()) {
+ extras = Some(
+ s.as_ref()
+ .graphemes(true)
+ .into_iter()
+ .filter_map(|x| {
+ if emojis::get(x).is_some() {
+ Some(x.to_string())
+ } else {
+ None
+ }
+ })
+ .collect::<Vec<String>>(),
+ );
+ }
+
+ (words, extras)
}
#[cfg(test)]
@@ -13,30 +42,57 @@ mod tests {
#[test]
fn hyphenated_words() {
let input = "these-words-are-hyphenated";
- assert_eq!(get_discrete_words(input), "these words are hyphenated");
+ assert_eq!(
+ get_discrete_words(input),
+ ("these words are hyphenated".into(), None)
+ );
}
#[test]
fn underscored_words() {
let input = "__array_structures";
- assert_eq!(get_discrete_words(input), "array structures");
+ assert_eq!(get_discrete_words(input), ("array structures".into(), None));
}
#[test]
fn camel_words() {
let input = "WKWebVIEWComponent";
- assert_eq!(get_discrete_words(input), "wk web view component");
+ assert_eq!(
+ get_discrete_words(input),
+ ("wk web view component".into(), None)
+ );
}
#[test]
fn dotted_words() {
let input = "page.Find";
- assert_eq!(get_discrete_words(input), "page find");
+ assert_eq!(get_discrete_words(input), ("page find".into(), None));
}
#[test]
fn misc_punctuation() {
let input = "cloud/cannon,page.find";
- assert_eq!(get_discrete_words(input), "cloud cannon page find");
+ assert_eq!(
+ get_discrete_words(input),
+ ("cloud cannon page find".into(), None)
+ );
+ }
+
+ #[test]
+ fn emoji() {
+ let input = "cloud🌦️cannon";
+ assert_eq!(
+ get_discrete_words(input),
+ ("cloud🌦️cannon".into(), Some(vec!["🌦️".into()]))
+ );
+
+ let input = "👋👨‍👩‍👧‍👦👾";
+ assert_eq!(
+ get_discrete_words(input),
+ (
+ "👋👨‍👩‍👧‍👦👾".into(),
+ Some(vec!["👋".into(), "👨‍👩‍👧‍👦".into(), "👾".into()])
+ )
+ );
}
}