Skip to content

Commit

Permalink
Merge pull request #407 from CloudCannon/feat/emoji-search
Browse files Browse the repository at this point in the history
Index emoji in and around words
  • Loading branch information
bglw authored Sep 6, 2023
2 parents cb5de8d + b43f087 commit 631c586
Show file tree
Hide file tree
Showing 5 changed files with 134 additions and 13 deletions.
33 changes: 31 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions pagefind/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ pagefind_stem = { version = "0.2.0", features = [
] }
convert_case = "0.6.0"
charabia = { version = "0.7.0", optional = true }
unicode-segmentation = "1.10.1"
emojis = "0.6.1"
hashbrown = { version = "0.13.1", features = ["serde"] }
regex = "1.1"
minicbor = { version = "0.19.1", features = ["alloc", "derive"] }
Expand Down
28 changes: 28 additions & 0 deletions pagefind/features/characters.feature
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,34 @@ Feature: Character Tests
Then There should be no logs
Then The selector "[data-result]" should contain "/apiary/"

Scenario: Pagefind matches emoji
Given I have a "public/fam-seperate/index.html" file with the body:
"""
<h1>Fam 👨‍👩‍👧‍👦</h1>
"""
Given I have a "public/fam-middled/index.html" file with the body:
"""
<h1>F👨‍👩‍👧‍👦am</h1>
"""
When I run my program
Then I should see "Running Pagefind" in stdout
Then I should see the file "public/pagefind/pagefind.js"
When I serve the "public" directory
When I load "/"
When I evaluate:
"""
async function() {
let pagefind = await import("/pagefind/pagefind.js");
let search = await pagefind.search("👨‍👩‍👧‍👦");
let pages = await Promise.all(search.results.map(r => r.data()));
document.querySelector('[data-result]').innerText = pages.map(p => p.url).sort().join(", ");
}
"""
Then There should be no logs
Then The selector "[data-result]" should contain "/fam-middled/, /fam-seperate/"

Scenario: Pagefind doesn't match HTML entities as their text
Given I have a "public/apiary/index.html" file with the body:
"""
Expand Down
12 changes: 9 additions & 3 deletions pagefind/src/fossick/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -332,13 +332,19 @@ impl Fossicker {

// For words that may be CompoundWords, also index them as their constituent parts
if normalized_word != word {
let parts = get_discrete_words(word);
let (word_parts, extras) = get_discrete_words(word);
// Only proceed if the word was broken into multiple parts
if parts.contains(|c: char| c.is_whitespace()) {
for part_word in parts.split_whitespace() {
if word_parts.contains(|c: char| c.is_whitespace()) {
for part_word in word_parts.split_whitespace() {
store_word(part_word, word_index, *word_weight);
}
}
// Additionally store any special extra characters we are given
if let Some(extras) = extras {
for extra in extras {
store_word(&extra, word_index, *word_weight);
}
}
}
}
if content.ends_with(' ') {
Expand Down
72 changes: 64 additions & 8 deletions pagefind/src/fossick/splitting.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,38 @@
use convert_case::{Case, Casing};
use emojis;
use lazy_static::lazy_static;
use regex::Regex;
use unicode_segmentation::UnicodeSegmentation;

pub fn get_discrete_words<S: AsRef<str>>(s: S) -> String {
s.as_ref()
// Pre-compiled pattern matching any single character that carries the Unicode
// `Emoji` property. `get_discrete_words` uses it as a quick test before
// walking a word grapheme by grapheme.
// The pattern is a fixed literal, so the `unwrap` can only fail on a programming error.
lazy_static! {
static ref EMOJI: Regex = Regex::new("\\p{Emoji}").unwrap();
}

/// Breaks a possibly-compound word into its constituent words and collects any emoji in it.
///
/// Returns a tuple of:
/// - the input with `.`, `,`, `/` and `:` replaced by spaces and then converted with
///   `Case::Lower`, so that constituent words end up separated by whitespace
///   (e.g. `"page.Find"` becomes `"page find"`);
/// - `Some(list)` holding every grapheme cluster of the input that `emojis::get`
///   recognises, in order of appearance, or `None` when the input contains none.
pub fn get_discrete_words<S: AsRef<str>>(s: S) -> (String, Option<Vec<String>>) {
    let input = s.as_ref();

    let words = input
        .replace(|c: char| matches!(c, '.' | ',' | '/' | ':'), " ")
        .to_case(Case::Lower);

    // Cheap pre-check so that ordinary words skip grapheme segmentation entirely.
    if !EMOJI.is_match(input) {
        return (words, None);
    }

    // Segment by extended grapheme cluster so multi-codepoint emoji
    // (ZWJ sequences, variation selectors) are kept as a single unit.
    let found: Vec<String> = input
        .graphemes(true)
        .filter(|grapheme| emojis::get(grapheme).is_some())
        .map(String::from)
        .collect();

    // `\p{Emoji}` also matches ASCII digits, `#` and `*`, so a regex hit does not
    // guarantee a real emoji. Report `None` rather than an empty list in that case.
    let extras = if found.is_empty() { None } else { Some(found) };

    (words, extras)
}

#[cfg(test)]
Expand All @@ -13,30 +42,57 @@ mod tests {
#[test]
fn hyphenated_words() {
    // Hyphens are word boundaries; with no emoji present the extras slot is `None`.
    let input = "these-words-are-hyphenated";
    assert_eq!(
        get_discrete_words(input),
        ("these words are hyphenated".into(), None)
    );
}

#[test]
fn underscored_words() {
    // Leading and separating underscores are dropped, leaving only the words.
    let input = "__array_structures";
    assert_eq!(get_discrete_words(input), ("array structures".into(), None));
}

#[test]
fn camel_words() {
    // Case transitions, including runs of capitals, split the word into parts.
    let input = "WKWebVIEWComponent";
    assert_eq!(
        get_discrete_words(input),
        ("wk web view component".into(), None)
    );
}

#[test]
fn dotted_words() {
    // A dot is replaced by a space before the word is lowercased.
    let input = "page.Find";
    assert_eq!(get_discrete_words(input), ("page find".into(), None));
}

#[test]
fn misc_punctuation() {
    // Slashes, commas and dots are all treated as word separators.
    let input = "cloud/cannon,page.find";
    assert_eq!(
        get_discrete_words(input),
        ("cloud cannon page find".into(), None)
    );
}

#[test]
fn emoji() {
    // An emoji wedged between letters stays in the word and is also reported on its own.
    let embedded_expected: (String, Option<Vec<String>>) =
        ("cloud🌦️cannon".into(), Some(vec!["🌦️".into()]));
    assert_eq!(get_discrete_words("cloud🌦️cannon"), embedded_expected);

    // A run of adjacent emoji is reported one grapheme cluster at a time,
    // keeping the ZWJ family sequence as a single entry.
    let run_expected: (String, Option<Vec<String>>) = (
        "👋👨‍👩‍👧‍👦🌾".into(),
        Some(vec!["👋".into(), "👨‍👩‍👧‍👦".into(), "🌾".into()]),
    );
    assert_eq!(get_discrete_words("👋👨‍👩‍👧‍👦🌾"), run_expected);
}
}

0 comments on commit 631c586

Please sign in to comment.